diff --git a/.mailmap b/.mailmap index 05877be16ce29..3c703fb4aaab9 100644 --- a/.mailmap +++ b/.mailmap @@ -28,9 +28,8 @@ -Jon Roelofs Jon Roelofs -Jon Roelofs Jonathan Roelofs -Jon Roelofs Jonathan Roelofs +Jon Roelofs +Jon Roelofs LLVM GN Syncbot Martin Storsjö -Saleem Abdulrasool +Saleem Abdulrasool diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp index 43812fe17a1c7..a191598415217 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.cpp @@ -433,17 +433,25 @@ void ProTypeMemberInitCheck::checkMissingMemberInitializer( [&](const FieldDecl *F) { OrderedFields.push_back(F); }); // Collect all the fields we need to initialize, including indirect fields. + // It only includes fields that have not been fixed SmallPtrSet AllFieldsToInit; - forEachField(ClassDecl, FieldsToInit, - [&](const FieldDecl *F) { AllFieldsToInit.insert(F); }); - if (AllFieldsToInit.empty()) + forEachField(ClassDecl, FieldsToInit, [&](const FieldDecl *F) { + if (!HasRecordClassMemberSet.contains(F)) { + AllFieldsToInit.insert(F); + HasRecordClassMemberSet.insert(F); + } + }); + if (FieldsToInit.empty()) return; DiagnosticBuilder Diag = diag(Ctor ? Ctor->getBeginLoc() : ClassDecl.getLocation(), "%select{|union }0constructor %select{does not|should}0 initialize " "%select{|one of }0these fields: %1") - << IsUnion << toCommaSeparatedString(OrderedFields, AllFieldsToInit); + << IsUnion << toCommaSeparatedString(OrderedFields, FieldsToInit); + + if (AllFieldsToInit.empty()) + return; // Do not propose fixes for constructors in macros since we cannot place them // correctly. diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h index 5b4144396eab4..af7b14ec68ad9 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeMemberInitCheck.h @@ -10,6 +10,7 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CPPCOREGUIDELINES_PRO_TYPE_MEMBER_INIT_H #include "../ClangTidyCheck.h" +#include "llvm/ADT/DenseSet.h" namespace clang { namespace tidy { @@ -72,6 +73,10 @@ class ProTypeMemberInitCheck : public ClangTidyCheck { // instead of brace initialization. Only effective in C++11 mode. Default is // false. bool UseAssignment; + + // Record the member variables that have been initialized to prevent repeated + // initialization. + llvm::DenseSet HasRecordClassMemberSet; }; } // namespace cppcoreguidelines diff --git a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp index fe25f7a7ccbcc..7dc519c152828 100644 --- a/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/BracesAroundStatementsCheck.cpp @@ -186,6 +186,10 @@ BracesAroundStatementsCheck::findRParenLoc(const IfOrWhileStmt *S, bool BracesAroundStatementsCheck::checkStmt( const MatchFinder::MatchResult &Result, const Stmt *S, SourceLocation InitialLoc, SourceLocation EndLocHint) { + + while (const auto *AS = dyn_cast(S)) + S = AS->getSubStmt(); + // 1) If there's a corresponding "else" or "while", the check inserts "} " // right before that token. 
// 2) If there's a multi-line block comment starting on the same line after diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index 476de70654cf9..056a3272ebd11 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -83,6 +83,7 @@ add_clang_library(clangDaemon HeaderSourceSwitch.cpp HeuristicResolver.cpp Hover.cpp + IncludeCleaner.cpp IncludeFixer.cpp InlayHints.cpp JSONTransport.cpp diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp index 57933169f9119..f9e283beca64c 100644 --- a/clang-tools-extra/clangd/CompileCommands.cpp +++ b/clang-tools-extra/clangd/CompileCommands.cpp @@ -127,7 +127,6 @@ const llvm::Optional detectSysroot() { if (::getenv("SDKROOT")) return llvm::None; return queryXcrun({"xcrun", "--show-sdk-path"}); - return llvm::None; } std::string detectStandardResourceDir() { diff --git a/clang-tools-extra/clangd/IncludeCleaner.cpp b/clang-tools-extra/clangd/IncludeCleaner.cpp new file mode 100644 index 0000000000000..dbf19d4e08f95 --- /dev/null +++ b/clang-tools-extra/clangd/IncludeCleaner.cpp @@ -0,0 +1,112 @@ +//===--- IncludeCleaner.cpp - Unused/Missing Headers Analysis ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "IncludeCleaner.h" +#include "support/Logger.h" +#include "clang/AST/RecursiveASTVisitor.h" +#include "clang/Basic/SourceLocation.h" + +namespace clang { +namespace clangd { +namespace { + +/// Crawler traverses the AST and feeds in the locations of (sometimes +/// implicitly) used symbols into \p Result. +class ReferencedLocationCrawler + : public RecursiveASTVisitor { +public: + ReferencedLocationCrawler(ReferencedLocations &Result) : Result(Result) {} + + bool VisitDeclRefExpr(DeclRefExpr *DRE) { + add(DRE->getDecl()); + add(DRE->getFoundDecl()); + return true; + } + + bool VisitMemberExpr(MemberExpr *ME) { + add(ME->getMemberDecl()); + add(ME->getFoundDecl().getDecl()); + return true; + } + + bool VisitTagType(TagType *TT) { + add(TT->getDecl()); + return true; + } + + bool VisitCXXConstructExpr(CXXConstructExpr *CCE) { + add(CCE->getConstructor()); + return true; + } + + bool VisitTemplateSpecializationType(TemplateSpecializationType *TST) { + if (isNew(TST)) { + add(TST->getTemplateName().getAsTemplateDecl()); // Primary template. + add(TST->getAsCXXRecordDecl()); // Specialization + } + return true; + } + + bool VisitTypedefType(TypedefType *TT) { + add(TT->getDecl()); + return true; + } + + // Consider types of any subexpression used, even if the type is not named. + // This is helpful in getFoo().bar(), where Foo must be complete. + // FIXME(kirillbobyrev): Should we tweak this? It may not be desirable to + // consider types "used" when they are not directly spelled in code. 
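// Editorial illustration (not part of the patch): the implicit-use case the
// comment above describes. Neither line in main.cpp spells `Foo`, yet the
// header that defines `Foo` is still needed, because the member call requires
// a complete type; VisitExpr marks `Foo` used via the expression's type.
//
//   // foo.h
//   struct Foo { void bar(); };
//   Foo getFoo();
//
//   // main.cpp
//   void test() { getFoo().bar(); }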
+ bool VisitExpr(Expr *E) { + TraverseType(E->getType()); + return true; + } + + bool TraverseType(QualType T) { + if (isNew(T.getTypePtrOrNull())) { // don't care about quals + Base::TraverseType(T); + } + return true; + } + + bool VisitUsingDecl(UsingDecl *D) { + for (const auto *Shadow : D->shadows()) { + add(Shadow->getTargetDecl()); + } + return true; + } + +private: + using Base = RecursiveASTVisitor; + + void add(const Decl *D) { + if (!D || !isNew(D->getCanonicalDecl())) { + return; + } + for (const Decl *Redecl : D->redecls()) { + Result.insert(Redecl->getLocation()); + } + } + + bool isNew(const void *P) { return P && Visited.insert(P).second; } + + ReferencedLocations &Result; + llvm::DenseSet Visited; +}; + +} // namespace + +ReferencedLocations findReferencedLocations(ParsedAST &AST) { + ReferencedLocations Result; + ReferencedLocationCrawler Crawler(Result); + Crawler.TraverseAST(AST.getASTContext()); + // FIXME(kirillbobyrev): Handle macros. + return Result; +} + +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/clangd/IncludeCleaner.h b/clang-tools-extra/clangd/IncludeCleaner.h new file mode 100644 index 0000000000000..ca9f79c0f3b03 --- /dev/null +++ b/clang-tools-extra/clangd/IncludeCleaner.h @@ -0,0 +1,52 @@ +//===--- IncludeCleaner.h - Unused/Missing Headers Analysis -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Include Cleaner is clangd functionality for providing diagnostics for misuse +/// of transitive headers and unused includes. It is inspired by +/// Include-What-You-Use tool (https://include-what-you-use.org/). Our goal is +/// to provide useful warnings in most popular scenarios but not 1:1 exact +/// feature compatibility. +/// +/// FIXME(kirillbobyrev): Add support for IWYU pragmas. +/// FIXME(kirillbobyrev): Add support for standard library headers. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INCLUDE_CLEANER_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INCLUDE_CLEANER_H + +#include "Headers.h" +#include "ParsedAST.h" +#include "clang/Basic/SourceLocation.h" +#include "llvm/ADT/DenseSet.h" + +namespace clang { +namespace clangd { + +using ReferencedLocations = llvm::DenseSet; +/// Finds locations of all symbols used in the main file. +/// +/// Uses RecursiveASTVisitor to go through main file AST and computes all the +/// locations used symbols are coming from. Returned locations may be macro +/// expansions, and are not resolved to their spelling/expansion location. These +/// locations are later used to determine which headers should be marked as +/// "used" and "directly used". +/// +/// We use this to compute unused headers, so we: +/// +/// - cover the whole file in a single traversal for efficiency +/// - don't attempt to describe where symbols were referenced from in +/// ambiguous cases (e.g. 
implicitly used symbols, multiple declarations) +/// - err on the side of reporting all possible locations +ReferencedLocations findReferencedLocations(ParsedAST &AST); + +} // namespace clangd +} // namespace clang + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INCLUDE_CLEANER_H diff --git a/clang-tools-extra/clangd/InlayHints.cpp b/clang-tools-extra/clangd/InlayHints.cpp index 1283aa4dd62cc..7c3c6a2421d83 100644 --- a/clang-tools-extra/clangd/InlayHints.cpp +++ b/clang-tools-extra/clangd/InlayHints.cpp @@ -13,6 +13,7 @@ #include "clang/AST/ExprCXX.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/Basic/SourceManager.h" +#include "llvm/Support/raw_ostream.h" namespace clang { namespace clangd { @@ -314,6 +315,10 @@ class InlayHintVisitor : public RecursiveASTVisitor { toHalfOpenFileRange(AST.getSourceManager(), AST.getLangOpts(), R); if (!FileRange) return; + // The hint may be in a file other than the main file (for example, a header + // file that was included after the preamble), do not show in that case. + if (!AST.getSourceManager().isWrittenInMainFile(FileRange->getBegin())) + return; Results.push_back(InlayHint{ Range{ sourceLocToPosition(AST.getSourceManager(), FileRange->getBegin()), diff --git a/clang-tools-extra/clangd/unittests/CMakeLists.txt b/clang-tools-extra/clangd/unittests/CMakeLists.txt index 2f5a754f882ae..8fc4cf414bdb1 100644 --- a/clang-tools-extra/clangd/unittests/CMakeLists.txt +++ b/clang-tools-extra/clangd/unittests/CMakeLists.txt @@ -58,6 +58,7 @@ add_unittest(ClangdUnitTests ClangdTests HeadersTests.cpp HeaderSourceSwitchTests.cpp HoverTests.cpp + IncludeCleanerTests.cpp IndexActionTests.cpp IndexTests.cpp InlayHintTests.cpp diff --git a/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp new file mode 100644 index 0000000000000..718076adc762a --- /dev/null +++ b/clang-tools-extra/clangd/unittests/IncludeCleanerTests.cpp @@ -0,0 +1,136 @@ +//===--- IncludeCleanerTests.cpp --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Annotations.h" +#include "IncludeCleaner.h" +#include "TestTU.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace clang { +namespace clangd { +namespace { + +TEST(IncludeCleaner, ReferencedLocations) { + struct TestCase { + std::string HeaderCode; + std::string MainCode; + }; + TestCase Cases[] = { + // DeclRefExpr + { + "int ^x();", + "int y = x();", + }, + // RecordDecl + { + "class ^X;", + "X *y;", + }, + // TypedefType and UsingDecls + { + "using ^Integer = int;", + "Integer x;", + }, + { + "namespace ns { struct ^X; struct ^X {}; }", + "using ns::X;", + }, + { + "namespace ns { struct X; struct X {}; }", + "using namespace ns;", + }, + { + "struct ^A {}; using B = A; using ^C = B;", + "C a;", + }, + { + "typedef bool ^Y; template struct ^X {};", + "X x;", + }, + { + "struct Foo; struct ^Foo{}; typedef Foo ^Bar;", + "Bar b;", + }, + // MemberExpr + { + "struct ^X{int ^a;}; X ^foo();", + "int y = foo().a;", + }, + // Expr (type is traversed) + { + "class ^X{}; X ^foo();", + "auto bar() { return foo(); }", + }, + // Redecls + { + "class ^X; class ^X{}; class ^X;", + "X *y;", + }, + // Constructor + { + "struct ^X { ^X(int) {} int ^foo(); };", + "auto x = X(42); auto y = x.foo();", + }, + // Static function + { + "struct ^X { static bool ^foo(); }; bool X::^foo() {}", + "auto b = X::foo();", + }, + // TemplateRecordDecl + { + "template class ^X;", + "X *y;", + }, + // Type name not spelled out in code + { + "class ^X{}; X ^getX();", + "auto x = getX();", + }, + // Enums + { + "enum ^Color { ^Red = 42, Green = 9000};", + "int MyColor = Red;", + }, + { + "struct ^X { enum ^Language { ^CXX = 42, Python = 9000}; };", + "int Lang = X::CXX;", + }, + { + // When a type is resolved via a using declaration, the + // UsingShadowDecl is not referenced in the AST. + // Compare to TypedefType, or DeclRefExpr::getFoundDecl(). 
+ // ^ + "namespace ns { class ^X; }; using ns::X;", + "X *y;", + }}; + for (const TestCase &T : Cases) { + TestTU TU; + TU.Code = T.MainCode; + Annotations Header(T.HeaderCode); + TU.HeaderCode = Header.code().str(); + auto AST = TU.build(); + + std::vector Points; + for (const auto &Loc : findReferencedLocations(AST)) { + if (AST.getSourceManager().getBufferName(Loc).endswith( + TU.HeaderFilename)) { + Points.push_back(offsetToPosition( + TU.HeaderCode, AST.getSourceManager().getFileOffset(Loc))); + } + } + llvm::sort(Points); + + EXPECT_EQ(Points, Header.points()) << T.HeaderCode << "\n---\n" + << T.MainCode; + } +} + +} // namespace +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp index 1410ed115b6bf..6796a8ce70fff 100644 --- a/clang-tools-extra/clangd/unittests/InlayHintTests.cpp +++ b/clang-tools-extra/clangd/unittests/InlayHintTests.cpp @@ -9,6 +9,7 @@ #include "InlayHints.h" #include "Protocol.h" #include "TestTU.h" +#include "TestWorkspace.h" #include "XRefs.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -398,6 +399,28 @@ TEST(ParameterHints, SetterFunctions) { ExpectedHint{"timeout_millis: ", "timeout_millis"}); } +TEST(ParameterHints, IncludeAtNonGlobalScope) { + Annotations FooInc(R"cpp( + void bar() { foo(42); } + )cpp"); + Annotations FooCC(R"cpp( + struct S { + void foo(int param); + #include "foo.inc" + }; + )cpp"); + + TestWorkspace Workspace; + Workspace.addSource("foo.inc", FooInc.code()); + Workspace.addMainFile("foo.cc", FooCC.code()); + + auto AST = Workspace.openFile("foo.cc"); + ASSERT_TRUE(bool(AST)); + + // Ensure the hint for the call in foo.inc is NOT materialized in foo.cc. + EXPECT_EQ(hintsOfKind(*AST, InlayHintKind::ParameterHint).size(), 0u); +} + TEST(TypeHints, Smoke) { assertTypeHints(R"cpp( auto $waldo[[waldo]] = 42; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-pro-type-member-init.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-pro-type-member-init.cpp index 403f28baf99d4..8cab4fd755752 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-pro-type-member-init.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines-pro-type-member-init.cpp @@ -208,9 +208,8 @@ struct PositiveMultipleConstructors { PositiveMultipleConstructors(const PositiveMultipleConstructors &) {} // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: constructor does not initialize these fields: A, B - // FIXME: The fix-its here collide providing an erroneous fix int A, B; - // CHECK-FIXES: int A{}{}{}, B{}{}{}; + // CHECK-FIXES: int A{}, B{}; }; typedef struct { diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-braces-around-statements-attributes.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-braces-around-statements-attributes.cpp new file mode 100644 index 0000000000000..e799614a1f7b0 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-braces-around-statements-attributes.cpp @@ -0,0 +1,24 @@ +// RUN: %check_clang_tidy -std=c++20-or-later %s readability-braces-around-statements %t + +void test(bool b) { + if (b) { + return; + } + if (b) [[likely]] { + // CHECK-FIXES-NOT: if (b) { {{[[][[]}}likely{{[]][]]}} { + return; + } + if (b) [[unlikely]] { + // CHECK-FIXES-NOT: if (b) { {{[[][[]}}unlikely{{[]][]]}} { + return; + } + + if (b) [[likely]] + // CHECK-FIXES: if (b) {{[[][[]}}likely{{[]][]]}} { + return; + // CHECK-FIXES: } + if (b) 
[[unlikely]] + // CHECK-FIXES: if (b) {{[[][[]}}unlikely{{[]][]]}} { + return; + // CHECK-FIXES: } +} diff --git a/clang/docs/AddressSanitizer.rst b/clang/docs/AddressSanitizer.rst index 7befbc3173da8..06b53e2e5da0b 100644 --- a/clang/docs/AddressSanitizer.rst +++ b/clang/docs/AddressSanitizer.rst @@ -282,11 +282,11 @@ Code generation control Instrumentation code outlining ------------------------------ -By default AddressSanitizer inlines the instumentation code to improve the +By default AddressSanitizer inlines the instrumentation code to improve the run-time performance, which leads to increased binary size. Using the (clang flag ``-fsanitize-address-outline-instrumentation` default: ``false``) -flag forces all code instumentation to be outlined, which reduces the size -of the generated code, but also reduces the run-time performace. +flag forces all code instrumentation to be outlined, which reduces the size +of the generated code, but also reduces the run-time performance. Limitations =========== diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst index 2770d28a9f3b0..dcbfba3aa836c 100644 --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -1705,7 +1705,7 @@ Enable support for int128\_t type .. option:: -ffp-contract= -Form fused FP ops (e.g. FMAs): fast (fuses across statements disregarding pragmas) \| on (only fuses in the same statement unless dictated by pragmas) \| off (never fuses) \| fast-honor-pragmas (fuses across statements unless diectated by pragmas). Default is 'fast' for CUDA, 'fast-honor-pragmas' for HIP, and 'on' otherwise. +Form fused FP ops (e.g. FMAs): fast (fuses across statements disregarding pragmas) \| on (only fuses in the same statement unless dictated by pragmas) \| off (never fuses) \| fast-honor-pragmas (fuses across statements unless dictated by pragmas). Default is 'fast' for CUDA, 'fast-honor-pragmas' for HIP, and 'on' otherwise. .. option:: -ffp-exception-behavior= @@ -2543,7 +2543,7 @@ Give global types 'default' visibility and global functions and variables 'hidde .. option:: -fvisibility-nodllstorageclass= -The visibility for defintiions without an explicit DLL export class \[-fvisibility-from-dllstorageclass\] +The visibility for definitions without an explicit DLL export class \[-fvisibility-from-dllstorageclass\] .. option:: -fvisibility= diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 96d89db7a5ccf..d8ac58734dc4a 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -741,8 +741,7 @@ the configuration (without a prefix: ``Auto``). enum { A, B } myEnum; false: - enum - { + enum { A, B } myEnum; @@ -3542,7 +3541,7 @@ the configuration (without a prefix: ``Auto``). ForEach and If macros. This is useful in projects where ForEach/If macros are treated as function calls instead of control statements. ``SBPO_ControlStatementsExceptForEachMacros`` remains an alias for - backward compatability. + backward compatibility. .. code-block:: c++ diff --git a/clang/docs/DataFlowSanitizerDesign.rst b/clang/docs/DataFlowSanitizerDesign.rst index bed4d2f38cba5..4f028de2ed09d 100644 --- a/clang/docs/DataFlowSanitizerDesign.rst +++ b/clang/docs/DataFlowSanitizerDesign.rst @@ -139,7 +139,7 @@ Origin tracking trace representation ------------------------------------ An origin tracking trace is a list of chains. 
Each chain has a stack trace -where the DFSan runtime records a label propapation, and a pointer to its +where the DFSan runtime records a label propagation, and a pointer to its previous chain. The very first chain does not point to any chain. Every four 4-bytes aligned application bytes share a 4-byte origin trace ID. A diff --git a/clang/docs/IntroductionToTheClangAST.rst b/clang/docs/IntroductionToTheClangAST.rst index 286ab88d01ef6..6fbb8f1d6440e 100644 --- a/clang/docs/IntroductionToTheClangAST.rst +++ b/clang/docs/IntroductionToTheClangAST.rst @@ -32,7 +32,7 @@ clang ParenExpr). Examining the AST ================= -A good way to familarize yourself with the Clang AST is to actually look +A good way to familiarize yourself with the Clang AST is to actually look at it on some simple example code. Clang has a builtin AST-dump mode, which can be enabled with the flag ``-ast-dump``. diff --git a/clang/docs/MemorySanitizer.rst b/clang/docs/MemorySanitizer.rst index 3ba5ce5bed3ee..c6fc9407ea15f 100644 --- a/clang/docs/MemorySanitizer.rst +++ b/clang/docs/MemorySanitizer.rst @@ -85,6 +85,15 @@ particular function. MemorySanitizer may still instrument such functions to avoid false positives. This attribute may not be supported by other compilers, so we suggest to use it together with ``__has_feature(memory_sanitizer)``. +``__attribute__((disable_sanitizer_instrumentation))`` +-------------------------------------------------------- + +The ``disable_sanitizer_instrumentation`` attribute can be applied to functions +to prevent all kinds of instrumentation. As a result, it may introduce false +positives and therefore should be used with care, and only if absolutely +required; for example for certain code that cannot tolerate any instrumentation +and resulting side-effects. This attribute overrides ``no_sanitize("memory")``. + Ignorelist ---------- diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 7291a5cfc5a0f..b1aa2ca65f14b 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -133,9 +133,9 @@ implementation. 
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | loop extension | inclusive scan extension (matching C++17 PSTL) | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory mangagement | memory allocators | :good:`done` | r341687,r357929 | +| memory management | memory allocators | :good:`done` | r341687,r357929 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| memory mangagement | allocate directive and allocate clause | :good:`done` | r355614,r335952 | +| memory management | allocate directive and allocate clause | :good:`done` | r355614,r335952 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | OMPD | OMPD interfaces | :part:`not upstream` | https://github.com/OpenMPToolsInterface/LLVM-openmp/tree/ompd-tests | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7604df7482c76..f728f5b4fcfc4 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -100,7 +100,9 @@ Windows Support C Language Changes in Clang --------------------------- -- ... +- Wide multi-characters literals such as ``L'ab'`` that would previously be interpreted as ``L'b'`` + are now ill-formed in all language modes. The motivation for this change is outlined in + `P2362 `_. C++ Language Changes in Clang ----------------------------- diff --git a/clang/include/clang/AST/ComparisonCategories.h b/clang/include/clang/AST/ComparisonCategories.h index b41e934142ee4..fb648b322b61c 100644 --- a/clang/include/clang/AST/ComparisonCategories.h +++ b/clang/include/clang/AST/ComparisonCategories.h @@ -115,8 +115,7 @@ class ComparisonCategoryInfo { public: /// The declaration for the comparison category type from the /// standard library. - // FIXME: Make this const - CXXRecordDecl *Record = nullptr; + const CXXRecordDecl *Record = nullptr; /// The Kind of the comparison category type ComparisonCategoryType Kind; diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 27241cfa9e4a4..b223b413c7556 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -615,7 +615,9 @@ class NamespaceDecl : public NamedDecl, public DeclContext, if (!isInline()) return false; auto X = lookup(Name); - auto Y = getParent()->lookup(Name); + // We should not perform a lookup within a transparent context, so find a + // non-transparent parent context. 
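// Editorial note (not part of the patch): the transparent contexts in Clang
// are things like LinkageSpecDecl (extern "C" / extern "C++") and ExportDecl
// (C++20 'export'); their members act, for name lookup, as members of the
// enclosing context. A hedged sketch of the shape this guards against:
//
//   namespace outer { export inline namespace inner { int x; } }
//
// getParent() of 'inner' can be the ExportDecl rather than 'outer', so the
// lookup below is forwarded to the first non-transparent ancestor.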
+ auto Y = getParent()->getNonTransparentContext()->lookup(Name); return std::distance(X.begin(), X.end()) == std::distance(Y.begin(), Y.end()); } diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 482d2889a25a1..18468c8ca1c47 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1997,6 +1997,12 @@ class DeclContext { return const_cast(this)->getNonClosureAncestor(); } + // Retrieve the nearest context that is not a transparent context. + DeclContext *getNonTransparentContext(); + const DeclContext *getNonTransparentContext() const { + return const_cast(this)->getNonTransparentContext(); + } + /// getPrimaryContext - There may be many different /// declarations of the same entity (including forward declarations /// of classes, multiple definitions of namespaces, etc.), each with diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 1bc2e955ddc73..3741ea9989b6e 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -3465,10 +3465,6 @@ class ConstantMatrixType final : public MatrixType { protected: friend class ASTContext; - /// The element type of the matrix. - // FIXME: Appears to be unused? There is also MatrixType::ElementType... - QualType ElementType; - /// Number of rows and columns. unsigned NumRows; unsigned NumColumns; diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index a5df8351e8c8a..fd99e677b1cc1 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -3607,6 +3607,13 @@ def NoSanitizeSpecific : InheritableAttr { let ASTNode = 0; } +def DisableSanitizerInstrumentation : InheritableAttr { + let Spellings = [Clang<"disable_sanitizer_instrumentation">]; + let Subjects = SubjectList<[Function, ObjCMethod, GlobalVar]>; + let Documentation = [DisableSanitizerInstrumentationDocs]; + let SimpleHandler = 1; +} + def CFICanonicalJumpTable : InheritableAttr { let Spellings = [Clang<"cfi_canonical_jump_table">]; let Subjects = SubjectList<[Function], ErrorDiag>; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 5cb6f853910b1..891e450bb2ab4 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3825,6 +3825,18 @@ full list of supported sanitizer flags. }]; } +def DisableSanitizerInstrumentationDocs : Documentation { + let Category = DocCatFunction; + let Content = [{ +Use the ``disable_sanitizer_instrumentation`` attribute on a function, +Objective-C method, or global variable, to specify that no sanitizer +instrumentation should be applied. + +This is not the same as ``__attribute__((no_sanitize(...)))``, which depending +on the tool may still insert instrumentation to prevent false positive reports. 
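A minimal usage sketch for the attribute documented above (editorial illustration, not part of the patch). The spelling and the allowed subjects (functions, Objective-C methods, and global variables) are taken from the Attr.td hunk earlier in this diff:

    // Built with a sanitizer enabled (e.g. -fsanitize=memory): neither the
    // global nor the function body receives any sanitizer instrumentation.
    __attribute__((disable_sanitizer_instrumentation)) int g_raw_state;

    __attribute__((disable_sanitizer_instrumentation))
    int read_raw(const int *p) {
      return *p; // uninstrumented; may therefore hide or cause false positives
    }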
+ }]; +} + def NoSanitizeAddressDocs : Documentation { let Category = DocCatFunction; // This function has multiple distinct spellings, and so it requires a custom diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def index 1dac5d2371d48..634bcaed20a6f 100644 --- a/clang/include/clang/Basic/BuiltinsAArch64.def +++ b/clang/include/clang/Basic/BuiltinsAArch64.def @@ -243,6 +243,9 @@ TARGET_HEADER_BUILTIN(_ReadStatusReg, "LLii", "nh", "intrin.h", ALL_MS_LANGUAG TARGET_HEADER_BUILTIN(_WriteStatusReg, "viLLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "") TARGET_HEADER_BUILTIN(_AddressOfReturnAddress, "v*", "nh", "intrin.h", ALL_MS_LANGUAGES, "") +TARGET_HEADER_BUILTIN(__mulh, "SLLiSLLiSLLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "") +TARGET_HEADER_BUILTIN(__umulh, "ULLiULLiULLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "") + #undef BUILTIN #undef LANGBUILTIN #undef TARGET_HEADER_BUILTIN diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index 04ec45aa3b747..f5120b23f8118 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -119,18 +119,22 @@ TARGET_BUILTIN(__builtin_wasm_all_true_i16x8, "iV8s", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_all_true_i32x4, "iV4i", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_all_true_i64x2, "iV2LLi", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_bitmask_i8x16, "iV16Sc", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_bitmask_i16x8, "iV8s", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_bitmask_i32x4, "iV4i", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_bitmask_i64x2, "iV2LLi", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_bitmask_i8x16, "UiV16Sc", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_bitmask_i16x8, "UiV8s", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_bitmask_i32x4, "UiV4i", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_bitmask_i64x2, "UiV2LLi", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_abs_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_abs_f64x2, "V2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_min_f32x4, "V4fV4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_f32x4, "V4fV4fV4f", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmin_f32x4, "V4fV4fV4f", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmax_f32x4, "V4fV4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_min_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_f64x2, "V2dV2dV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmin_f64x2, "V2dV2dV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_pmax_f64x2, "V2dV2dV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_ceil_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_floor_f32x4, "V4fV4f", "nc", "simd128") diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index c54133c449889..f21c17ee0ebe9 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1850,9 +1850,151 @@ TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_256, "vV8iV8iUc*Uc*", "nV:256:", "a TARGET_BUILTIN(__builtin_ia32_vp2intersect_d_128, "vV4iV4iUc*Uc*", "nV:128:", "avx512vp2intersect,avx512vl") // AVX512 fp16 intrinsics +TARGET_BUILTIN(__builtin_ia32_vcomish, "iV8xV8xIiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_addph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") 
+TARGET_BUILTIN(__builtin_ia32_subph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_mulph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_divph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_maxph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_minph512, "V32xV32xV32xIi", "ncV:512:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_minph256, "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_minph128, "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_maxph256, "V16xV16xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_maxph128, "V8xV8xV8x", "ncV:128:", "avx512fp16,avx512vl") + +TARGET_BUILTIN(__builtin_ia32_addsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_divsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_mulsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_subsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_maxsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_minsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_cmpph512_mask, "UiV32xV32xIiUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_cmpph256_mask, "UsV16xV16xIiUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_cmpph128_mask, "UcV8xV8xIiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_cmpsh_mask, "UcV8xV8xIiUcIi", "ncV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_loadsh128_mask, "V8xV8x*V8xUc", "nV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_storesh128_mask, "vV8x*V8xUc", "nV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_rcpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rcpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rcpph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_rsqrtph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rsqrtph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rsqrtph512_mask, "V32xV32xV32xUi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_getmantph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_getmantph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_getmantph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_getexpph128_mask, "V8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_getexpph256_mask, "V16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_getexpph512_mask, "V32xV32xV32xUiIi", "ncV:512:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_scalefph128_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_scalefph256_mask, "V16xV16xV16xV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_scalefph512_mask, "V32xV32xV32xV32xUiIi", "ncV:512:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_rndscaleph_128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rndscaleph_256_mask, 
"V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_rndscaleph_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduceph128_mask, "V8xV8xIiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduceph256_mask, "V16xV16xIiV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduceph512_mask, "V32xV32xIiV32xUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_rcpsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_rsqrtsh_mask, "V8xV8xV8xV8xUc", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_getmantsh_round_mask, "V8xV8xV8xIiV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_getexpsh128_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_scalefsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_rndscalesh_round_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reducesh_mask, "V8xV8xV8xV8xUcIiIi", "ncV:128:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_sqrtph, "V8xV8x", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_sqrtph256, "V16xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_sqrtph512, "V32xV32xIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_sqrtsh_round_mask, "V8xV8xV8xV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_fpclassph128_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_fpclassph256_mask, "UsV16xIiUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_fpclassph512_mask, "UiV32xIiUi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_fpclasssh_mask, "UcV8xIiUc", "ncV:128:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph128_mask, "V8xV2dV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph256_mask, "V8xV4dV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtpd2ph512_mask, "V8xV8dV8xUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2pd128_mask, "V2dV8xV2dUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2pd256_mask, "V4dV8xV4dUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2pd512_mask, "V8dV8xV8dUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2ss_round_mask, "V4fV4fV8xV4fUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtss2sh_round_mask, "V8xV8xV4fV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsd2sh_round_mask, "V8xV8xV2dV8xUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2sd_round_mask, "V2dV2dV8xV2dUcIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2w128_mask, "V8sV8xV8sUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2w256_mask, "V16sV16xV16sUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2w512_mask, "V32sV32xV32sUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2w128_mask, "V8sV8xV8sUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2w256_mask, "V16sV16xV16sUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2w512_mask, "V32sV32xV32sUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtw2ph128_mask, "V8xV8sV8xUc", "ncV:128:", "avx512fp16,avx512vl") 
+TARGET_BUILTIN(__builtin_ia32_vcvtw2ph256_mask, "V16xV16sV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtw2ph512_mask, "V32xV32sV32xUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uw128_mask, "V8UsV8xV8UsUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uw256_mask, "V16UsV16xV16UsUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uw512_mask, "V32UsV32xV32UsUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uw128_mask, "V8UsV8xV8UsUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uw256_mask, "V16UsV16xV16UsUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uw512_mask, "V32UsV32xV32UsUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph128_mask, "V8xV8UsV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph256_mask, "V16xV16UsV16xUs", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtuw2ph512_mask, "V32xV32UsV32xUiIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2dq128_mask, "V4iV8xV4iUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2dq256_mask, "V8iV8xV8iUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2dq512_mask, "V16iV16xV16iUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2udq128_mask, "V4UiV8xV4UiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2udq256_mask, "V8UiV8xV8UiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2udq512_mask, "V16UiV16xV16UiUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph128_mask, "V8xV4iV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph256_mask, "V8xV8iV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtdq2ph512_mask, "V16xV16iV16xUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph128_mask, "V8xV4UiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph256_mask, "V8xV8UiV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtudq2ph512_mask, "V16xV16UiV16xUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2dq128_mask, "V4iV8xV4iUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2dq256_mask, "V8iV8xV8iUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2dq512_mask, "V16iV16xV16iUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2udq128_mask, "V4UiV8xV4UiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2udq256_mask, "V8UiV8xV8UiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2udq512_mask, "V16UiV16xV16UiUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph128_mask, "V8xV2OiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph256_mask, "V8xV4OiV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtqq2ph512_mask, "V8xV8OiV8xUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2qq128_mask, "V2OiV8xV2OiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2qq256_mask, "V4OiV8xV4OiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2qq512_mask, "V8OiV8xV8OiUcIi", "ncV:512:", "avx512fp16") 
+TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph128_mask, "V8xV2UOiV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph256_mask, "V8xV4UOiV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtuqq2ph512_mask, "V8xV8UOiV8xUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq128_mask, "V2UOiV8xV2UOiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq256_mask, "V4UOiV8xV4UOiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2uqq512_mask, "V8UOiV8xV8UOiUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2qq128_mask, "V2OiV8xV2OiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2qq256_mask, "V4OiV8xV4OiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2qq512_mask, "V8OiV8xV8OiUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq128_mask, "V2UOiV8xV2UOiUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq256_mask, "V4UOiV8xV4UOiUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvttph2uqq512_mask, "V8UOiV8xV8UOiUcIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2si32, "iV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2usi32, "UiV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtusi2sh, "V8xV8xUiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsi2sh, "V8xV8xiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttsh2si32, "iV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttsh2usi32, "UiV8xIi", "ncV:128:", "avx512fp16") + +TARGET_BUILTIN(__builtin_ia32_vcvtph2psx128_mask, "V4fV8xV4fUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2psx256_mask, "V8fV8xV8fUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtph2psx512_mask, "V16fV16xV16fUsIi", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtps2phx128_mask, "V8xV4fV8xUc", "ncV:128:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtps2phx256_mask, "V8xV8fV8xUc", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_vcvtps2phx512_mask, "V16xV16fV16xUsIi", "ncV:512:", "avx512fp16") + // generic select intrinsics TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "ncV:128:", "avx512bw,avx512vl") TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "ncV:256:", "avx512bw,avx512vl") @@ -1886,12 +2028,24 @@ TARGET_BUILTIN(__builtin_ia32_reduce_and_d512, "iV16i", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_and_q512, "OiV8Oi", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fadd_pd512, "ddV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ps512, "ffV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph512, "xxV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fadd_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_fmax_pd512, "dV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ps512, "fV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph512, "xV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fmax_ph128, "xV8x", 
"ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_fmin_pd512, "dV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ps512, "fV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph512, "xV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph256, "xV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fmin_ph128, "xV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_fmul_pd512, "ddV8d", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ps512, "ffV16f", "ncV:512:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph512, "xxV32x", "ncV:512:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph256, "xxV16x", "ncV:256:", "avx512fp16,avx512vl") +TARGET_BUILTIN(__builtin_ia32_reduce_fmul_ph128, "xxV8x", "ncV:128:", "avx512fp16,avx512vl") TARGET_BUILTIN(__builtin_ia32_reduce_mul_d512, "iV16i", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_mul_q512, "OiV8Oi", "ncV:512:", "avx512f") TARGET_BUILTIN(__builtin_ia32_reduce_or_d512, "iV16i", "ncV:512:", "avx512f") diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index ce2b1decdf6ca..e0c9bec9b4e00 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -92,6 +92,12 @@ TARGET_BUILTIN(__builtin_ia32_cvtsi2sd64, "V2dV2dOiIi", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_cvtsi2ss64, "V4fV4fOiIi", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_cvtusi2sd64, "V2dV2dUOiIi", "ncV:128:", "avx512f") TARGET_BUILTIN(__builtin_ia32_cvtusi2ss64, "V4fV4fUOiIi", "ncV:128:", "avx512f") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2si64, "OiV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsh2usi64, "UOiV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtusi642sh, "V8xV8xUOiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvtsi642sh, "V8xV8xOiIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttsh2si64, "OiV8xIi", "ncV:128:", "avx512fp16") +TARGET_BUILTIN(__builtin_ia32_vcvttsh2usi64, "UOiV8xIi", "ncV:128:", "avx512fp16") TARGET_BUILTIN(__builtin_ia32_directstore_u64, "vULi*ULi", "n", "movdiri") // UINTR diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h index 8577475fab069..472eb38b9829d 100644 --- a/clang/include/clang/Basic/CharInfo.h +++ b/clang/include/clang/Basic/CharInfo.h @@ -43,6 +43,11 @@ LLVM_READNONE inline bool isASCII(char c) { return static_cast(c) <= 127; } +LLVM_READNONE inline bool isASCII(unsigned char c) { return c <= 127; } + +/// Returns true if this is an ASCII character. +LLVM_READNONE inline bool isASCII(uint32_t c) { return c <= 127; } + /// Returns true if this is a valid first character of a C identifier, /// which is [a-zA-Z_]. 
LLVM_READONLY inline bool isIdentifierHead(unsigned char c, diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index b4ec18d2726e6..c250f78b969a2 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -64,6 +64,7 @@ def StringConversion : DiagGroup<"string-conversion">; def SignConversion : DiagGroup<"sign-conversion">; def PointerBoolConversion : DiagGroup<"pointer-bool-conversion">; def UndefinedBoolConversion : DiagGroup<"undefined-bool-conversion">; +def BoolOperation : DiagGroup<"bool-operation">; def BoolConversion : DiagGroup<"bool-conversion", [PointerBoolConversion, UndefinedBoolConversion]>; def IntConversion : DiagGroup<"int-conversion">; @@ -822,8 +823,10 @@ def ReservedIdentifier : DiagGroup<"reserved-identifier", // under separate flags. // def UnreachableCodeLoopIncrement : DiagGroup<"unreachable-code-loop-increment">; +def UnreachableCodeFallthrough : DiagGroup<"unreachable-code-fallthrough">; def UnreachableCode : DiagGroup<"unreachable-code", - [UnreachableCodeLoopIncrement]>; + [UnreachableCodeLoopIncrement, + UnreachableCodeFallthrough]>; def UnreachableCodeBreak : DiagGroup<"unreachable-code-break">; def UnreachableCodeReturn : DiagGroup<"unreachable-code-return">; def UnreachableCodeAggressive : DiagGroup<"unreachable-code-aggressive", @@ -946,6 +949,7 @@ def Extra : DiagGroup<"extra", [ ]>; def Most : DiagGroup<"most", [ + BoolOperation, CharSubscript, Comment, DeleteNonVirtualDtor, diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td index f621cdb466560..c19adf104db1f 100644 --- a/clang/include/clang/Basic/DiagnosticLexKinds.td +++ b/clang/include/clang/Basic/DiagnosticLexKinds.td @@ -113,8 +113,10 @@ def warn_four_char_character_literal : Warning< // Unicode and UCNs def err_invalid_utf8 : Error< "source file is not valid UTF-8">; -def err_non_ascii : Error< - "non-ASCII characters are not allowed outside of literals and identifiers">; +def err_character_not_allowed : Error< + "unexpected character ">; +def err_character_not_allowed_identifier : Error< + "character not allowed %select{in|at the start of}1 an identifier">; def ext_unicode_whitespace : ExtWarn< "treating Unicode character as whitespace">, InGroup>; @@ -150,9 +152,6 @@ def warn_c99_compat_unicode_id : Warning< "%select{using this character in an identifier|starting an identifier with " "this character}0 is incompatible with C99">, InGroup, DefaultIgnore; -def warn_cxx98_compat_unicode_id : Warning< - "using this character in an identifier is incompatible with C++98">, - InGroup, DefaultIgnore; def warn_cxx98_compat_literal_ucn_escape_basic_scs : Warning< "specifying character '%0' with a universal character name " @@ -184,12 +183,10 @@ def warn_c2x_compat_digit_separator : Warning< InGroup, DefaultIgnore; def err_digit_separator_not_between_digits : Error< "digit separator cannot appear at %select{start|end}0 of digit sequence">; -def warn_extraneous_char_constant : Warning< - "extraneous characters in character constant ignored">; def warn_char_constant_too_large : Warning< "character constant too long for its type">; -def err_multichar_utf_character_literal : Error< - "Unicode character literals may not contain multiple characters">; +def err_multichar_character_literal : Error< + "%select{wide|Unicode}0 character literals may not contain multiple characters">; def err_exponent_has_no_digits : Error<"exponent has no digits">; def 
err_hex_constant_requires : Error< "hexadecimal floating %select{constant|literal}0 requires " diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 7bddf754263e6..f2eb528ab5ab9 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -715,6 +715,9 @@ def warn_unreachable_return : Warning< def warn_unreachable_loop_increment : Warning< "loop will run at most once (loop increment never executed)">, InGroup, DefaultIgnore; +def warn_unreachable_fallthrough_attr : Warning< + "fallthrough annotation in unreachable code">, + InGroup, DefaultIgnore; def note_unreachable_silence : Note< "silence by adding parentheses to mark code as explicitly dead">; @@ -7472,7 +7475,7 @@ def note_member_first_declared_here : Note< def warn_bitwise_negation_bool : Warning< "bitwise negation of a boolean expression%select{;| always evaluates to 'true';}0 " "did you mean logical negation?">, - InGroup>; + InGroup, DefaultIgnore; def err_decrement_bool : Error<"cannot decrement expression of type bool">; def warn_increment_bool : Warning< "incrementing expression of type bool is deprecated and " @@ -9628,9 +9631,6 @@ def err_fallthrough_attr_outside_switch : Error< "fallthrough annotation is outside switch statement">; def err_fallthrough_attr_invalid_placement : Error< "fallthrough annotation does not directly precede switch label">; -def warn_fallthrough_attr_unreachable : Warning< - "fallthrough annotation in unreachable code">, - InGroup, DefaultIgnore; def warn_unreachable_default : Warning< "default label in switch which covers all enumeration values">, diff --git a/clang/include/clang/Basic/LangStandards.def b/clang/include/clang/Basic/LangStandards.def index 160dc3f2405a7..6056cfd65bbbf 100644 --- a/clang/include/clang/Basic/LangStandards.def +++ b/clang/include/clang/Basic/LangStandards.def @@ -180,12 +180,18 @@ LANGSTANDARD(opencl20, "cl2.0", LANGSTANDARD(opencl30, "cl3.0", OpenCL, "OpenCL 3.0", LineComment | C99 | Digraphs | HexFloat | OpenCL) + LANGSTANDARD(openclcpp10, "clc++1.0", OpenCL, "C++ for OpenCL 1.0", LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | Digraphs | HexFloat | OpenCL) LANGSTANDARD_ALIAS(openclcpp10, "clc++") +LANGSTANDARD(openclcpp2021, "clc++2021", + OpenCL, "C++ for OpenCL 2021", + LineComment | CPlusPlus | CPlusPlus11 | CPlusPlus14 | CPlusPlus17 | + Digraphs | HexFloat | OpenCL) + LANGSTANDARD_ALIAS_DEPR(opencl10, "CL") LANGSTANDARD_ALIAS_DEPR(opencl11, "CL1.1") LANGSTANDARD_ALIAS_DEPR(opencl12, "CL1.2") @@ -193,6 +199,7 @@ LANGSTANDARD_ALIAS_DEPR(opencl20, "CL2.0") LANGSTANDARD_ALIAS_DEPR(opencl30, "CL3.0") LANGSTANDARD_ALIAS_DEPR(openclcpp10, "CLC++") LANGSTANDARD_ALIAS_DEPR(openclcpp10, "CLC++1.0") +LANGSTANDARD_ALIAS_DEPR(openclcpp2021, "CLC++2021") // CUDA LANGSTANDARD(cuda, "cuda", CUDA, "NVIDIA CUDA(tm)", diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index c7a57a7dba9a8..ab855948b447c 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -210,8 +210,8 @@ class TargetInfo : public virtual TransferrableTargetInfo, unsigned char RegParmMax, SSERegParmMax; TargetCXXABI TheCXXABI; const LangASMap *AddrSpaceMap; - const unsigned *GridValues = - nullptr; // Array of target-specific GPU grid values that must be + const llvm::omp::GV *GridValues = + nullptr; // target-specific GPU grid values that must be // consistent between host RTL (plugin), 
device RTL, and clang. mutable StringRef PlatformName; @@ -871,6 +871,11 @@ class TargetInfo : public virtual TransferrableTargetInfo, /// across the current set of primary and secondary targets. virtual ArrayRef getTargetBuiltins() const = 0; + /// Returns target-specific min and max values VScale_Range. + virtual Optional> + getVScaleRange(const LangOptions &LangOpts) const { + return None; + } /// The __builtin_clz* and __builtin_ctz* built-in /// functions are specified to have undefined results for zero inputs, but /// on targets that support these operations in a way that provides @@ -1405,10 +1410,10 @@ class TargetInfo : public virtual TransferrableTargetInfo, return LangAS::Default; } - /// Return a target-specific GPU grid value based on the GVIDX enum \p gv - unsigned getGridValue(llvm::omp::GVIDX gv) const { + /// Return a target-specific GPU grid values + const llvm::omp::GV &getGridValue() const { assert(GridValues != nullptr && "GridValues not initialized"); - return GridValues[gv]; + return *GridValues; } /// Retrieve the name of the platform as it is used in the diff --git a/clang/include/clang/Driver/Distro.h b/clang/include/clang/Driver/Distro.h index 0d2a0939639ea..d9909bcf96968 100644 --- a/clang/include/clang/Driver/Distro.h +++ b/clang/include/clang/Driver/Distro.h @@ -37,6 +37,7 @@ class Distro { DebianStretch, DebianBuster, DebianBullseye, + DebianBookworm, Exherbo, RHEL5, RHEL6, @@ -119,7 +120,7 @@ class Distro { bool IsOpenSUSE() const { return DistroVal == OpenSUSE; } bool IsDebian() const { - return DistroVal >= DebianLenny && DistroVal <= DebianBullseye; + return DistroVal >= DebianLenny && DistroVal <= DebianBookworm; } bool IsUbuntu() const { diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 872df1bc06264..e5da70a52427a 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -887,7 +887,7 @@ def cl_no_signed_zeros : Flag<["-"], "cl-no-signed-zeros">, Group, MarshallingInfoFlag>; def cl_std_EQ : Joined<["-"], "cl-std=">, Group, Flags<[CC1Option]>, HelpText<"OpenCL language standard to compile for.">, - Values<"cl,CL,cl1.0,CL1.0,cl1.1,CL1.1,cl1.2,CL1.2,cl2.0,CL2.0,cl3.0,CL3.0,clc++,CLC++,clc++1.0,CLC++1.0">; + Values<"cl,CL,cl1.0,CL1.0,cl1.1,CL1.1,cl1.2,CL1.2,cl2.0,CL2.0,cl3.0,CL3.0,clc++,CLC++,clc++1.0,CLC++1.0,clc++2021,CLC++2021">; def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group, HelpText<"OpenCL only. 
Allow denormals to be flushed to zero.">; def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group, Flags<[CC1Option]>, @@ -2584,7 +2584,7 @@ def ftrivial_auto_var_init : Joined<["-"], "ftrivial-auto-var-init=">, Group, MarshallingInfoEnum, "Uninitialized">; def enable_trivial_var_init_zero : Flag<["-"], "enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang">, - Flags<[CC1Option, CoreOption]>, + Flags<[CC1Option, CoreOption, NoArgumentUnused]>, HelpText<"Trivial automatic variable initialization to zero is only here for benchmarks, it'll eventually be removed, and I'm OK with that because I'm only using it to benchmark">; def ftrivial_auto_var_init_stop_after : Joined<["-"], "ftrivial-auto-var-init-stop-after=">, Group, Flags<[CC1Option, CoreOption]>, HelpText<"Stop initializing trivial automatic stack variables after the specified number of instances">, @@ -4402,7 +4402,6 @@ def mno_vzeroupper : Flag<["-"], "mno-vzeroupper">, Group; // These are legacy user-facing driver-level option spellings. They are always // aliases for options that are spelled using the more common Unix / GNU flag // style of double-dash and equals-joined flags. -def gcc_toolchain_legacy_spelling : Separate<["-"], "gcc-toolchain">, Alias; def target_legacy_spelling : Separate<["-"], "target">, Alias; // Special internal option to handle -Xlinker --no-demangle. diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index abed12f38d92b..66d8297beae52 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -70,6 +70,8 @@ struct ParsedAttrInfo { const char *NormalizedFullName; }; ArrayRef Spellings; + // The names of the known arguments of this attribute. + ArrayRef ArgNames; ParsedAttrInfo(AttributeCommonInfo::Kind AttrKind = AttributeCommonInfo::NoSemaHandlerAttribute) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index f417a9db7a83a..ba825ff2fea57 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11597,15 +11597,14 @@ class Sema final { SourceLocation ModifierLoc, SourceLocation EndLoc); /// Called on well-formed 'map' clause. - OMPClause * - ActOnOpenMPMapClause(ArrayRef MapTypeModifiers, - ArrayRef MapTypeModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, - OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, - SourceLocation MapLoc, SourceLocation ColonLoc, - ArrayRef VarList, const OMPVarListLocTy &Locs, - ArrayRef UnresolvedMappers = llvm::None); + OMPClause *ActOnOpenMPMapClause( + ArrayRef MapTypeModifiers, + ArrayRef MapTypeModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, + OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, + SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef VarList, + const OMPVarListLocTy &Locs, bool NoDiagnose = false, + ArrayRef UnresolvedMappers = llvm::None); /// Called on well-formed 'num_teams' clause. 
OMPClause *ActOnOpenMPNumTeamsClause(Expr *NumTeams, SourceLocation StartLoc, SourceLocation LParenLoc, diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index 444b00d73f0b7..125ef859d1ebb 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -485,7 +485,17 @@ def DynamicMemoryModeling: Checker<"DynamicMemoryModeling">, "allocating and deallocating functions are annotated with " "ownership_holds, ownership_takes and ownership_returns.", "false", - InAlpha> + InAlpha>, + CmdLineOption ]>, Dependencies<[CStringModeling]>, Documentation, diff --git a/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h b/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h index 24cae12af24a1..139b0dcd51704 100644 --- a/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h +++ b/clang/include/clang/StaticAnalyzer/Core/BugReporter/BugReporterVisitors.h @@ -21,6 +21,7 @@ #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringRef.h" #include #include @@ -622,6 +623,84 @@ class TagVisitor : public BugReporterVisitor { PathSensitiveBugReport &R) override; }; +class ObjCMethodCall; +class CXXConstructorCall; + +/// Put a diagnostic on return statement (or on } in its absence) of all inlined +/// functions for which some property remained unchanged. +/// Resulting diagnostics may read such as "Returning without writing to X". +/// +/// Descendants can define what a "state change is", like a change of value +/// to a memory region, liveness, etc. For function calls where the state did +/// not change as defined, a custom note may be constructed. +class NoStateChangeFuncVisitor : public BugReporterVisitor { +private: + /// Frames modifying the state as defined in \c wasModifiedBeforeCallExit. + /// This visitor generates a note only if a function does *not* change the + /// state that way. This information is not immediately available + /// by looking at the node associated with the exit from the function + /// (usually the return statement). To avoid recomputing the same information + /// many times (going up the path for each node and checking whether the + /// region was written into) we instead lazily compute the stack frames + /// along the path. + llvm::SmallPtrSet FramesModifying; + llvm::SmallPtrSet FramesModifyingCalculated; + + /// Check and lazily calculate whether the state is modified in the stack + /// frame to which \p CallExitBeginN belongs. + /// The calculation is cached in FramesModifying. + bool isModifiedInFrame(const ExplodedNode *CallExitBeginN); + + /// Write to \c FramesModifying all stack frames along the path in the current + /// stack frame which modifies the state. + void findModifyingFrames(const ExplodedNode *const CallExitBeginN); + +protected: + bugreporter::TrackingKind TKind; + + /// \return Whether the state was modified from the current node, \CurrN, to + /// the end of the stack fram, at \p CallExitBeginN. + virtual bool + wasModifiedBeforeCallExit(const ExplodedNode *CurrN, + const ExplodedNode *CallExitBeginN) = 0; + + /// Consume the information on the non-modifying stack frame in order to + /// either emit a note or not. May suppress the report entirely. 
+  /// \return Diagnostics piece for the unmodified state in the current
+  /// function, if it decides to emit one. A good description might start with
+  /// "Returning without...".
+  virtual PathDiagnosticPieceRef
+  maybeEmitNoteForObjCSelf(PathSensitiveBugReport &R,
+                           const ObjCMethodCall &Call,
+                           const ExplodedNode *N) = 0;
+
+  /// Consume the information on the non-modifying stack frame in order to
+  /// either emit a note or not. May suppress the report entirely.
+  /// \return Diagnostics piece for the unmodified state in the current
+  /// function, if it decides to emit one. A good description might start with
+  /// "Returning without...".
+  virtual PathDiagnosticPieceRef
+  maybeEmitNoteForCXXThis(PathSensitiveBugReport &R,
+                          const CXXConstructorCall &Call,
+                          const ExplodedNode *N) = 0;
+
+  /// Consume the information on the non-modifying stack frame in order to
+  /// either emit a note or not. May suppress the report entirely.
+  /// \return Diagnostics piece for the unmodified state in the current
+  /// function, if it decides to emit one. A good description might start with
+  /// "Returning without...".
+  virtual PathDiagnosticPieceRef
+  maybeEmitNoteForParameters(PathSensitiveBugReport &R, const CallEvent &Call,
+                             const ExplodedNode *N) = 0;
+
+public:
+  NoStateChangeFuncVisitor(bugreporter::TrackingKind TKind) : TKind(TKind) {}
+
+  PathDiagnosticPieceRef VisitNode(const ExplodedNode *N,
+                                   BugReporterContext &BR,
+                                   PathSensitiveBugReport &R) override final;
+};
+
 } // namespace ento
 } // namespace clang
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index e85ca8ef86157..c49a52b03106f 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -172,29 +172,28 @@ static SourceLocation getDeclLocForCommentSearch(const Decl *D,
       // Allow association with Y across {} in `typedef struct X {} Y`.
       isa(D))
     return D->getBeginLoc();
-  else {
-    const SourceLocation DeclLoc = D->getLocation();
-    if (DeclLoc.isMacroID()) {
-      if (isa(D)) {
-        // If location of the typedef name is in a macro, it is because being
-        // declared via a macro. Try using declaration's starting location as
-        // the "declaration location".
-        return D->getBeginLoc();
-      } else if (const auto *TD = dyn_cast(D)) {
-        // If location of the tag decl is inside a macro, but the spelling of
-        // the tag name comes from a macro argument, it looks like a special
-        // macro like NS_ENUM is being used to define the tag decl. In that
-        // case, adjust the source location to the expansion loc so that we can
-        // attach the comment to the tag decl.
-        if (SourceMgr.isMacroArgExpansion(DeclLoc) &&
-            TD->isCompleteDefinition())
-          return SourceMgr.getExpansionLoc(DeclLoc);
-      }
+
+  const SourceLocation DeclLoc = D->getLocation();
+  if (DeclLoc.isMacroID()) {
+    if (isa(D)) {
+      // If location of the typedef name is in a macro, it is because being
+      // declared via a macro. Try using declaration's starting location as
+      // the "declaration location".
+      return D->getBeginLoc();
+    }
+
+    if (const auto *TD = dyn_cast(D)) {
+      // If location of the tag decl is inside a macro, but the spelling of
+      // the tag name comes from a macro argument, it looks like a special
+      // macro like NS_ENUM is being used to define the tag decl. In that
+      // case, adjust the source location to the expansion loc so that we can
+      // attach the comment to the tag decl.
+ if (SourceMgr.isMacroArgExpansion(DeclLoc) && TD->isCompleteDefinition()) + return SourceMgr.getExpansionLoc(DeclLoc); } - return DeclLoc; } - return {}; + return DeclLoc; } RawComment *ASTContext::getRawCommentForDeclNoCacheImpl( diff --git a/clang/lib/AST/ASTDiagnostic.cpp b/clang/lib/AST/ASTDiagnostic.cpp index dc22481d0a84c..7e435e8b35b80 100644 --- a/clang/lib/AST/ASTDiagnostic.cpp +++ b/clang/lib/AST/ASTDiagnostic.cpp @@ -1088,6 +1088,9 @@ class TemplateDiff { Ty->getAs()) return TST; + if (const auto* SubstType = Ty->getAs()) + Ty = SubstType->getReplacementType(); + const RecordType *RT = Ty->getAs(); if (!RT) diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 3467da2b549e2..e042ae8dae4ae 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1217,6 +1217,15 @@ bool DeclContext::Encloses(const DeclContext *DC) const { return false; } +DeclContext *DeclContext::getNonTransparentContext() { + DeclContext *DC = this; + while (DC->isTransparentContext()) { + DC = DC->getParent(); + assert(DC && "All transparent contexts should have a parent!"); + } + return DC; +} + DeclContext *DeclContext::getPrimaryContext() { switch (getDeclKind()) { case Decl::ExternCContext: diff --git a/clang/lib/Basic/LangOptions.cpp b/clang/lib/Basic/LangOptions.cpp index 169b679490389..3e3e7fe3dc155 100644 --- a/clang/lib/Basic/LangOptions.cpp +++ b/clang/lib/Basic/LangOptions.cpp @@ -47,6 +47,8 @@ bool LangOptions::isNoBuiltinFunc(StringRef FuncName) const { VersionTuple LangOptions::getOpenCLVersionTuple() const { const int Ver = OpenCLCPlusPlus ? OpenCLCPlusPlusVersion : OpenCLVersion; + if (OpenCLCPlusPlus && Ver != 100) + return VersionTuple(Ver / 100); return VersionTuple(Ver / 100, (Ver % 100) / 10); } diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index e163ebfa2348b..2b5bf34a7b23f 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -424,6 +424,17 @@ ArrayRef AArch64TargetInfo::getTargetBuiltins() const { Builtin::FirstTSBuiltin); } +Optional> +AArch64TargetInfo::getVScaleRange(const LangOptions &LangOpts) const { + if (LangOpts.ArmSveVectorBits) { + unsigned VScale = LangOpts.ArmSveVectorBits / 128; + return std::pair(VScale, VScale); + } + if (hasFeature("sve")) + return std::pair(0, 16); + return None; +} + bool AArch64TargetInfo::hasFeature(StringRef Feature) const { return Feature == "aarch64" || Feature == "arm64" || Feature == "arm" || (Feature == "neon" && (FPU & NeonMode)) || diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 46882a808336b..12830348fb453 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -96,6 +96,9 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { ArrayRef getTargetBuiltins() const override; + Optional> + getVScaleRange(const LangOptions &LangOpts) const override; + bool hasFeature(StringRef Feature) const override; bool handleTargetFeatures(std::vector &Features, DiagnosticsEngine &Diags) override; diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index fac786dbcf9e2..cebb19e7ccab3 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -335,7 +335,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple, llvm::AMDGPU::getArchAttrR600(GPUKind)) { resetDataLayout(isAMDGCN(getTriple()) ? 
DataLayoutStringAMDGCN : DataLayoutStringR600); - GridValues = llvm::omp::AMDGPUGpuGridValues; + GridValues = &llvm::omp::AMDGPUGridValues; setAddressSpaceMap(Triple.getOS() == llvm::Triple::Mesa3D || !isAMDGCN(Triple)); diff --git a/clang/lib/Basic/Targets/AMDGPU.h b/clang/lib/Basic/Targets/AMDGPU.h index f8772cbe244f0..77c2c5fd50145 100644 --- a/clang/lib/Basic/Targets/AMDGPU.h +++ b/clang/lib/Basic/Targets/AMDGPU.h @@ -353,6 +353,8 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUTargetInfo final : public TargetInfo { LangAS getCUDABuiltinAddressSpace(unsigned AS) const override { switch (AS) { + case 0: + return LangAS::Default; case 1: return LangAS::cuda_device; case 3: diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index ae546492a8bb3..398d5a1107f51 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -65,7 +65,7 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, TLSSupported = false; VLASupported = false; AddrSpaceMap = &NVPTXAddrSpaceMap; - GridValues = llvm::omp::NVPTXGpuGridValues; + GridValues = &llvm::omp::NVPTXGridValues; UseAddrSpaceMapMangling = true; HasLegalHalfType = true; HasFloat16 = true; diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index c15d2df33f9f7..c8afb71e7dfd1 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -237,6 +237,7 @@ static void defineXLCompatMacros(MacroBuilder &Builder) { Builder.defineMacro("__fsqrt", "__builtin_ppc_fsqrt"); Builder.defineMacro("__fsqrts", "__builtin_ppc_fsqrts"); Builder.defineMacro("__addex", "__builtin_ppc_addex"); + Builder.defineMacro("__cmplxl", "__builtin_complex"); } /// PPCTargetInfo::getTargetDefines - Return a set of the PowerPC-specific diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 7c7431886be01..f906383082269 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1189,7 +1189,7 @@ static void addSanitizers(const Triple &TargetTriple, CompileKernel, Recover, ModuleUseAfterScope, UseOdrIndicator, DestructorKind)); MPM.addPass(createModuleToFunctionPassAdaptor(AddressSanitizerPass( - CompileKernel, Recover, UseAfterScope, UseAfterReturn))); + {CompileKernel, Recover, UseAfterScope, UseAfterReturn}))); } }; ASanPass(SanitizerKind::Address, false); @@ -1199,8 +1199,8 @@ static void addSanitizers(const Triple &TargetTriple, if (LangOpts.Sanitize.has(Mask)) { bool Recover = CodeGenOpts.SanitizeRecover.has(Mask); MPM.addPass(HWAddressSanitizerPass( - CompileKernel, Recover, - /*DisableOptimization=*/CodeGenOpts.OptimizationLevel == 0)); + {CompileKernel, Recover, + /*DisableOptimization=*/CodeGenOpts.OptimizationLevel == 0})); } }; HWASanPass(SanitizerKind::HWAddress, false); @@ -1289,6 +1289,8 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager( "", PGOOptions::NoAction, PGOOptions::CSIRInstr, CodeGenOpts.DebugInfoForProfiling); } + if (TM) + TM->setPGOOption(PGOOpt); PipelineTuningOptions PTO; PTO.LoopUnrolling = CodeGenOpts.UnrollLoops; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 65671a49ed92f..6bbf556a55680 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -8385,7 +8385,7 @@ Value *CodeGenFunction::vectorWrapScalar16(Value *Op) { /// SVEBuiltinMemEltTy - Returns the memory element type for this memory /// access builtin. Only required if it can't be inferred from the base pointer /// operand. 
-llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(SVETypeFlags TypeFlags) { +llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(const SVETypeFlags &TypeFlags) { switch (TypeFlags.getMemEltType()) { case SVETypeFlags::MemEltTyDefault: return getEltType(TypeFlags); @@ -8401,7 +8401,7 @@ llvm::Type *CodeGenFunction::SVEBuiltinMemEltTy(SVETypeFlags TypeFlags) { llvm_unreachable("Unknown MemEltType"); } -llvm::Type *CodeGenFunction::getEltType(SVETypeFlags TypeFlags) { +llvm::Type *CodeGenFunction::getEltType(const SVETypeFlags &TypeFlags) { switch (TypeFlags.getEltType()) { default: llvm_unreachable("Invalid SVETypeFlag!"); @@ -8436,7 +8436,7 @@ llvm::Type *CodeGenFunction::getEltType(SVETypeFlags TypeFlags) { // Return the llvm predicate vector type corresponding to the specified element // TypeFlags. llvm::ScalableVectorType * -CodeGenFunction::getSVEPredType(SVETypeFlags TypeFlags) { +CodeGenFunction::getSVEPredType(const SVETypeFlags &TypeFlags) { switch (TypeFlags.getEltType()) { default: llvm_unreachable("Unhandled SVETypeFlag!"); @@ -8505,7 +8505,8 @@ CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) { } } -llvm::Value *CodeGenFunction::EmitSVEAllTruePred(SVETypeFlags TypeFlags) { +llvm::Value * +CodeGenFunction::EmitSVEAllTruePred(const SVETypeFlags &TypeFlags) { Function *Ptrue = CGM.getIntrinsic(Intrinsic::aarch64_sve_ptrue, getSVEPredType(TypeFlags)); return Builder.CreateCall(Ptrue, {Builder.getInt32(/*SV_ALL*/ 31)}); @@ -8549,7 +8550,7 @@ Value *CodeGenFunction::EmitSVEPredicateCast(Value *Pred, return C; } -Value *CodeGenFunction::EmitSVEGatherLoad(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSVEGatherLoad(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID) { auto *ResultTy = getSVEType(TypeFlags); @@ -8601,7 +8602,7 @@ Value *CodeGenFunction::EmitSVEGatherLoad(SVETypeFlags TypeFlags, : Builder.CreateSExt(Call, ResultTy); } -Value *CodeGenFunction::EmitSVEScatterStore(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSVEScatterStore(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID) { auto *SrcDataTy = getSVEType(TypeFlags); @@ -8656,7 +8657,7 @@ Value *CodeGenFunction::EmitSVEScatterStore(SVETypeFlags TypeFlags, return Builder.CreateCall(F, Ops); } -Value *CodeGenFunction::EmitSVEGatherPrefetch(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSVEGatherPrefetch(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID) { // The gather prefetches are overloaded on the vector input - this can either @@ -8689,7 +8690,7 @@ Value *CodeGenFunction::EmitSVEGatherPrefetch(SVETypeFlags TypeFlags, return Builder.CreateCall(F, Ops); } -Value *CodeGenFunction::EmitSVEStructLoad(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSVEStructLoad(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID) { llvm::ScalableVectorType *VTy = getSVEType(TypeFlags); @@ -8723,7 +8724,7 @@ Value *CodeGenFunction::EmitSVEStructLoad(SVETypeFlags TypeFlags, return Builder.CreateCall(F, { Predicate, BasePtr }); } -Value *CodeGenFunction::EmitSVEStructStore(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSVEStructStore(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID) { llvm::ScalableVectorType *VTy = getSVEType(TypeFlags); @@ -8770,7 +8771,7 @@ Value *CodeGenFunction::EmitSVEStructStore(SVETypeFlags TypeFlags, // SVE2's svpmullb and svpmullt builtins are similar to the svpmullb_pair and // svpmullt_pair intrinsics, with the exception that their results are bitcast // to a wider type. 
-Value *CodeGenFunction::EmitSVEPMull(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSVEPMull(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned BuiltinID) { // Splat scalar operand to vector (intrinsics with _n infix) @@ -8788,14 +8789,14 @@ Value *CodeGenFunction::EmitSVEPMull(SVETypeFlags TypeFlags, return EmitSVEReinterpret(Call, Ty); } -Value *CodeGenFunction::EmitSVEMovl(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSVEMovl(const SVETypeFlags &TypeFlags, ArrayRef Ops, unsigned BuiltinID) { llvm::Type *OverloadedTy = getSVEType(TypeFlags); Function *F = CGM.getIntrinsic(BuiltinID, OverloadedTy); return Builder.CreateCall(F, {Ops[0], Builder.getInt32(0)}); } -Value *CodeGenFunction::EmitSVEPrefetchLoad(SVETypeFlags TypeFlags, +Value *CodeGenFunction::EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned BuiltinID) { auto *MemEltTy = SVEBuiltinMemEltTy(TypeFlags); @@ -8904,8 +8905,10 @@ static void InsertExplicitUndefOperand(CGBuilderTy &Builder, llvm::Type *Ty, Ops.insert(Ops.begin(), SplatUndef); } -SmallVector CodeGenFunction::getSVEOverloadTypes( - SVETypeFlags TypeFlags, llvm::Type *ResultType, ArrayRef Ops) { +SmallVector +CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags, + llvm::Type *ResultType, + ArrayRef Ops) { if (TypeFlags.isOverloadNone()) return {}; @@ -9718,6 +9721,29 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F); } + if (BuiltinID == AArch64::BI__mulh || BuiltinID == AArch64::BI__umulh) { + llvm::Type *ResType = ConvertType(E->getType()); + llvm::Type *Int128Ty = llvm::IntegerType::get(getLLVMContext(), 128); + + bool IsSigned = BuiltinID == AArch64::BI__mulh; + Value *LHS = + Builder.CreateIntCast(EmitScalarExpr(E->getArg(0)), Int128Ty, IsSigned); + Value *RHS = + Builder.CreateIntCast(EmitScalarExpr(E->getArg(1)), Int128Ty, IsSigned); + + Value *MulResult, *HigherBits; + if (IsSigned) { + MulResult = Builder.CreateNSWMul(LHS, RHS); + HigherBits = Builder.CreateAShr(MulResult, 64); + } else { + MulResult = Builder.CreateNUWMul(LHS, RHS); + HigherBits = Builder.CreateLShr(MulResult, 64); + } + HigherBits = Builder.CreateIntCast(HigherBits, ResType, IsSigned); + + return HigherBits; + } + // Handle MSVC intrinsics before argument evaluation to prevent double // evaluation. 
if (Optional MsvcIntId = translateAarch64ToMsvcIntrin(BuiltinID)) @@ -12729,10 +12755,16 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_cvtdq2ps512_mask: case X86::BI__builtin_ia32_cvtqq2ps512_mask: case X86::BI__builtin_ia32_cvtqq2pd512_mask: + case X86::BI__builtin_ia32_vcvtw2ph512_mask: + case X86::BI__builtin_ia32_vcvtdq2ph512_mask: + case X86::BI__builtin_ia32_vcvtqq2ph512_mask: return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ true); case X86::BI__builtin_ia32_cvtudq2ps512_mask: case X86::BI__builtin_ia32_cvtuqq2ps512_mask: case X86::BI__builtin_ia32_cvtuqq2pd512_mask: + case X86::BI__builtin_ia32_vcvtuw2ph512_mask: + case X86::BI__builtin_ia32_vcvtudq2ph512_mask: + case X86::BI__builtin_ia32_vcvtuqq2ph512_mask: return EmitX86ConvertIntToFp(*this, E, Ops, /*IsSigned*/ false); case X86::BI__builtin_ia32_vfmaddss3: @@ -13936,15 +13968,28 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, } return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0); } + case X86::BI__builtin_ia32_sqrtsh_round_mask: case X86::BI__builtin_ia32_sqrtsd_round_mask: case X86::BI__builtin_ia32_sqrtss_round_mask: { unsigned CC = cast(Ops[4])->getZExtValue(); // Support only if the rounding mode is 4 (AKA CUR_DIRECTION), // otherwise keep the intrinsic. if (CC != 4) { - Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtsd_round_mask ? - Intrinsic::x86_avx512_mask_sqrt_sd : - Intrinsic::x86_avx512_mask_sqrt_ss; + Intrinsic::ID IID; + + switch (BuiltinID) { + default: + llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_sqrtsh_round_mask: + IID = Intrinsic::x86_avx512fp16_mask_sqrt_sh; + break; + case X86::BI__builtin_ia32_sqrtsd_round_mask: + IID = Intrinsic::x86_avx512_mask_sqrt_sd; + break; + case X86::BI__builtin_ia32_sqrtss_round_mask: + IID = Intrinsic::x86_avx512_mask_sqrt_ss; + break; + } return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } Value *A = Builder.CreateExtractElement(Ops[1], (uint64_t)0); @@ -13966,6 +14011,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_sqrtpd: case X86::BI__builtin_ia32_sqrtps256: case X86::BI__builtin_ia32_sqrtps: + case X86::BI__builtin_ia32_sqrtph256: + case X86::BI__builtin_ia32_sqrtph: + case X86::BI__builtin_ia32_sqrtph512: case X86::BI__builtin_ia32_sqrtps512: case X86::BI__builtin_ia32_sqrtpd512: { if (Ops.size() == 2) { @@ -13973,9 +14021,21 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, // Support only if the rounding mode is 4 (AKA CUR_DIRECTION), // otherwise keep the intrinsic. if (CC != 4) { - Intrinsic::ID IID = BuiltinID == X86::BI__builtin_ia32_sqrtps512 ? 
- Intrinsic::x86_avx512_sqrt_ps_512 : - Intrinsic::x86_avx512_sqrt_pd_512; + Intrinsic::ID IID; + + switch (BuiltinID) { + default: + llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_sqrtph512: + IID = Intrinsic::x86_avx512fp16_sqrt_ph_512; + break; + case X86::BI__builtin_ia32_sqrtps512: + IID = Intrinsic::x86_avx512_sqrt_ps_512; + break; + case X86::BI__builtin_ia32_sqrtpd512: + IID = Intrinsic::x86_avx512_sqrt_pd_512; + break; + } return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } } @@ -14143,28 +14203,40 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, {Ops[0]}); } case X86::BI__builtin_ia32_reduce_fadd_pd512: - case X86::BI__builtin_ia32_reduce_fadd_ps512: { + case X86::BI__builtin_ia32_reduce_fadd_ps512: + case X86::BI__builtin_ia32_reduce_fadd_ph512: + case X86::BI__builtin_ia32_reduce_fadd_ph256: + case X86::BI__builtin_ia32_reduce_fadd_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fadd, Ops[1]->getType()); Builder.getFastMathFlags().setAllowReassoc(); return Builder.CreateCall(F, {Ops[0], Ops[1]}); } case X86::BI__builtin_ia32_reduce_fmul_pd512: - case X86::BI__builtin_ia32_reduce_fmul_ps512: { + case X86::BI__builtin_ia32_reduce_fmul_ps512: + case X86::BI__builtin_ia32_reduce_fmul_ph512: + case X86::BI__builtin_ia32_reduce_fmul_ph256: + case X86::BI__builtin_ia32_reduce_fmul_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fmul, Ops[1]->getType()); Builder.getFastMathFlags().setAllowReassoc(); return Builder.CreateCall(F, {Ops[0], Ops[1]}); } case X86::BI__builtin_ia32_reduce_fmax_pd512: - case X86::BI__builtin_ia32_reduce_fmax_ps512: { + case X86::BI__builtin_ia32_reduce_fmax_ps512: + case X86::BI__builtin_ia32_reduce_fmax_ph512: + case X86::BI__builtin_ia32_reduce_fmax_ph256: + case X86::BI__builtin_ia32_reduce_fmax_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fmax, Ops[0]->getType()); Builder.getFastMathFlags().setNoNaNs(); return Builder.CreateCall(F, {Ops[0]}); } case X86::BI__builtin_ia32_reduce_fmin_pd512: - case X86::BI__builtin_ia32_reduce_fmin_ps512: { + case X86::BI__builtin_ia32_reduce_fmin_ps512: + case X86::BI__builtin_ia32_reduce_fmin_ph512: + case X86::BI__builtin_ia32_reduce_fmin_ph256: + case X86::BI__builtin_ia32_reduce_fmin_ph128: { Function *F = CGM.getIntrinsic(Intrinsic::vector_reduce_fmin, Ops[0]->getType()); Builder.getFastMathFlags().setNoNaNs(); @@ -14280,6 +14352,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_fpclassps128_mask: case X86::BI__builtin_ia32_fpclassps256_mask: case X86::BI__builtin_ia32_fpclassps512_mask: + case X86::BI__builtin_ia32_fpclassph128_mask: + case X86::BI__builtin_ia32_fpclassph256_mask: + case X86::BI__builtin_ia32_fpclassph512_mask: case X86::BI__builtin_ia32_fpclasspd128_mask: case X86::BI__builtin_ia32_fpclasspd256_mask: case X86::BI__builtin_ia32_fpclasspd512_mask: { @@ -14291,6 +14366,15 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, Intrinsic::ID ID; switch (BuiltinID) { default: llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_fpclassph128_mask: + ID = Intrinsic::x86_avx512fp16_fpclass_ph_128; + break; + case X86::BI__builtin_ia32_fpclassph256_mask: + ID = Intrinsic::x86_avx512fp16_fpclass_ph_256; + break; + case X86::BI__builtin_ia32_fpclassph512_mask: + ID = Intrinsic::x86_avx512fp16_fpclass_ph_512; + break; case X86::BI__builtin_ia32_fpclassps128_mask: ID = Intrinsic::x86_avx512_fpclass_ps_128; 
break; @@ -14428,6 +14512,9 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_cmpordps: case X86::BI__builtin_ia32_cmpordpd: return getVectorFCmpIR(CmpInst::FCMP_ORD, /*IsSignaling*/false); + case X86::BI__builtin_ia32_cmpph128_mask: + case X86::BI__builtin_ia32_cmpph256_mask: + case X86::BI__builtin_ia32_cmpph512_mask: case X86::BI__builtin_ia32_cmpps128_mask: case X86::BI__builtin_ia32_cmpps256_mask: case X86::BI__builtin_ia32_cmpps512_mask: @@ -15909,11 +15996,9 @@ Value *EmitAMDGPUDispatchPtr(CodeGenFunction &CGF, const CallExpr *E = nullptr) { auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_dispatch_ptr); auto *Call = CGF.Builder.CreateCall(F); - Call->addAttribute( - AttributeList::ReturnIndex, + Call->addRetAttr( Attribute::getWithDereferenceableBytes(Call->getContext(), 64)); - Call->addAttribute(AttributeList::ReturnIndex, - Attribute::getWithAlignment(Call->getContext(), Align(4))); + Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4))); if (!E) return Call; QualType BuiltinRetType = E->getType(); @@ -17786,6 +17871,22 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::maximum, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_pmin_f32x4: + case WebAssembly::BI__builtin_wasm_pmin_f64x2: { + Value *LHS = EmitScalarExpr(E->getArg(0)); + Value *RHS = EmitScalarExpr(E->getArg(1)); + Function *Callee = + CGM.getIntrinsic(Intrinsic::wasm_pmin, ConvertType(E->getType())); + return Builder.CreateCall(Callee, {LHS, RHS}); + } + case WebAssembly::BI__builtin_wasm_pmax_f32x4: + case WebAssembly::BI__builtin_wasm_pmax_f64x2: { + Value *LHS = EmitScalarExpr(E->getArg(0)); + Value *RHS = EmitScalarExpr(E->getArg(1)); + Function *Callee = + CGM.getIntrinsic(Intrinsic::wasm_pmax, ConvertType(E->getType())); + return Builder.CreateCall(Callee, {LHS, RHS}); + } case WebAssembly::BI__builtin_wasm_ceil_f32x4: case WebAssembly::BI__builtin_wasm_floor_f32x4: case WebAssembly::BI__builtin_wasm_trunc_f32x4: diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 96117aeea07ff..daf25f41492d7 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1892,7 +1892,7 @@ void CodeGenModule::addDefaultFunctionDefinitionAttributes(llvm::Function &F) { getDefaultFunctionAttributes(F.getName(), F.hasOptNone(), /* AttrOnCallSite = */ false, FuncAttrs); // TODO: call GetCPUAndFeaturesAttributes? 
- F.addAttributes(llvm::AttributeList::FunctionIndex, FuncAttrs); + F.addFnAttrs(FuncAttrs); } void CodeGenModule::addDefaultFunctionDefinitionAttributes( @@ -4540,10 +4540,8 @@ maybeRaiseRetAlignmentAttribute(llvm::LLVMContext &Ctx, if (CurAlign >= NewAlign) return Attrs; llvm::Attribute AlignAttr = llvm::Attribute::getWithAlignment(Ctx, NewAlign); - return Attrs - .removeAttribute(Ctx, llvm::AttributeList::ReturnIndex, - llvm::Attribute::AttrKind::Alignment) - .addAttribute(Ctx, llvm::AttributeList::ReturnIndex, AlignAttr); + return Attrs.removeRetAttribute(Ctx, llvm::Attribute::AttrKind::Alignment) + .addRetAttribute(Ctx, AlignAttr); } template class AbstractAssumeAlignedAttrEmitter { @@ -5209,15 +5207,11 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (const FunctionDecl *FD = dyn_cast_or_null(CurFuncDecl)) if (FD->hasAttr()) // All calls within a strictfp function are marked strictfp - Attrs = - Attrs.addAttribute(getLLVMContext(), llvm::AttributeList::FunctionIndex, - llvm::Attribute::StrictFP); + Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::StrictFP); // Add call-site nomerge attribute if exists. if (InNoMergeAttributedStmt) - Attrs = - Attrs.addAttribute(getLLVMContext(), llvm::AttributeList::FunctionIndex, - llvm::Attribute::NoMerge); + Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::NoMerge); // Apply some call-site-specific attributes. // TODO: work this into building the attribute set. @@ -5227,15 +5221,12 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (CurCodeDecl && CurCodeDecl->hasAttr() && !(TargetDecl && TargetDecl->hasAttr())) { Attrs = - Attrs.addAttribute(getLLVMContext(), llvm::AttributeList::FunctionIndex, - llvm::Attribute::AlwaysInline); + Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::AlwaysInline); } // Disable inlining inside SEH __try blocks. if (isSEHTryScope()) { - Attrs = - Attrs.addAttribute(getLLVMContext(), llvm::AttributeList::FunctionIndex, - llvm::Attribute::NoInline); + Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::NoInline); } // Decide whether to use a call or an invoke. @@ -5251,7 +5242,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, CannotThrow = true; } else { // Otherwise, nounwind call sites will never throw. 
- CannotThrow = Attrs.hasFnAttribute(llvm::Attribute::NoUnwind); + CannotThrow = Attrs.hasFnAttr(llvm::Attribute::NoUnwind); if (auto *FPtr = dyn_cast(CalleePtr)) if (FPtr->hasFnAttribute(llvm::Attribute::NoUnwind)) @@ -5274,9 +5265,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (const FunctionDecl *FD = dyn_cast_or_null(CurFuncDecl)) if (FD->hasAttr()) // All calls within a strictfp function are marked strictfp - Attrs = - Attrs.addAttribute(getLLVMContext(), llvm::AttributeList::FunctionIndex, - llvm::Attribute::StrictFP); + Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::StrictFP); AssumeAlignedAttrEmitter AssumeAlignedAttrEmitter(*this, TargetDecl); Attrs = AssumeAlignedAttrEmitter.TryEmitAsCallSiteAttribute(Attrs); @@ -5303,8 +5292,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (const auto *FD = dyn_cast_or_null(CurFuncDecl)) { if (const auto *A = FD->getAttr()) { if (A->getGuard() == CFGuardAttr::GuardArg::nocf && !CI->getCalledFunction()) - Attrs = Attrs.addAttribute( - getLLVMContext(), llvm::AttributeList::FunctionIndex, "guard_nocf"); + Attrs = Attrs.addFnAttribute(getLLVMContext(), "guard_nocf"); } } @@ -5374,8 +5362,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, // attributes of the called function. if (auto *F = CI->getCalledFunction()) F->removeFnAttr(llvm::Attribute::NoReturn); - CI->removeAttribute(llvm::AttributeList::FunctionIndex, - llvm::Attribute::NoReturn); + CI->removeFnAttr(llvm::Attribute::NoReturn); // Avoid incompatibility with ASan which relies on the `noreturn` // attribute to insert handler calls. diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 6a3c98f7989ef..e7c42d95f61a2 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1384,16 +1384,16 @@ llvm::DIType *CGDebugInfo::createBitFieldType(const FieldDecl *BitFieldDecl, Offset = BitFieldInfo.StorageSize - BitFieldInfo.Size - Offset; uint64_t OffsetInBits = StorageOffsetInBits + Offset; llvm::DINode::DIFlags Flags = getAccessFlag(BitFieldDecl->getAccess(), RD); + llvm::DINodeArray Annotations = CollectBTFTagAnnotations(BitFieldDecl); return DBuilder.createBitFieldMemberType( RecordTy, Name, File, Line, SizeInBits, OffsetInBits, StorageOffsetInBits, - Flags, DebugType); + Flags, DebugType, Annotations); } -llvm::DIType * -CGDebugInfo::createFieldType(StringRef name, QualType type, SourceLocation loc, - AccessSpecifier AS, uint64_t offsetInBits, - uint32_t AlignInBits, llvm::DIFile *tunit, - llvm::DIScope *scope, const RecordDecl *RD) { +llvm::DIType *CGDebugInfo::createFieldType( + StringRef name, QualType type, SourceLocation loc, AccessSpecifier AS, + uint64_t offsetInBits, uint32_t AlignInBits, llvm::DIFile *tunit, + llvm::DIScope *scope, const RecordDecl *RD, llvm::DINodeArray Annotations) { llvm::DIType *debugType = getOrCreateType(type, tunit); // Get the location for the field. 
@@ -1411,7 +1411,7 @@ CGDebugInfo::createFieldType(StringRef name, QualType type, SourceLocation loc, llvm::DINode::DIFlags flags = getAccessFlag(AS, RD); return DBuilder.createMemberType(scope, name, file, line, SizeInBits, Align, - offsetInBits, flags, debugType); + offsetInBits, flags, debugType, Annotations); } void CGDebugInfo::CollectRecordLambdaFields( @@ -1501,9 +1501,10 @@ void CGDebugInfo::CollectRecordNormalField( FieldType = createBitFieldType(field, RecordTy, RD); } else { auto Align = getDeclAlignIfRequired(field, CGM.getContext()); + llvm::DINodeArray Annotations = CollectBTFTagAnnotations(field); FieldType = createFieldType(name, type, field->getLocation(), field->getAccess(), - OffsetInBits, Align, tunit, RecordTy, RD); + OffsetInBits, Align, tunit, RecordTy, RD, Annotations); } elements.push_back(FieldType); @@ -2070,6 +2071,20 @@ llvm::DINodeArray CGDebugInfo::CollectCXXTemplateParams( return CollectTemplateParams(TPList, TAList.asArray(), Unit); } +llvm::DINodeArray CGDebugInfo::CollectBTFTagAnnotations(const Decl *D) { + if (!D->hasAttr()) + return nullptr; + + SmallVector Annotations; + for (const auto *I : D->specific_attrs()) { + llvm::Metadata *Ops[2] = { + llvm::MDString::get(CGM.getLLVMContext(), StringRef("btf_tag")), + llvm::MDString::get(CGM.getLLVMContext(), I->getBTFTag())}; + Annotations.push_back(llvm::MDNode::get(CGM.getLLVMContext(), Ops)); + } + return DBuilder.getOrCreateArray(Annotations); +} + llvm::DIType *CGDebugInfo::getOrCreateVTablePtrType(llvm::DIFile *Unit) { if (VTablePtrType) return VTablePtrType; @@ -3442,9 +3457,10 @@ llvm::DICompositeType *CGDebugInfo::CreateLimitedType(const RecordType *Ty) { Flags |= llvm::DINode::FlagExportSymbols; } + llvm::DINodeArray Annotations = CollectBTFTagAnnotations(D); llvm::DICompositeType *RealDecl = DBuilder.createReplaceableCompositeType( getTagForRecord(RD), RDName, RDContext, DefUnit, Line, 0, Size, Align, - Flags, Identifier); + Flags, Identifier, Annotations); // Elements of composite types usually have back to the type, creating // uniquing cycles. Distinct nodes are more efficient. diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index f3322fa91f57f..2cccc392658cf 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -297,11 +297,15 @@ class CGDebugInfo { CollectCXXTemplateParams(const ClassTemplateSpecializationDecl *TS, llvm::DIFile *F); + /// A helper function to collect debug info for btf_tag annotations. 
+ llvm::DINodeArray CollectBTFTagAnnotations(const Decl *D); + llvm::DIType *createFieldType(StringRef name, QualType type, SourceLocation loc, AccessSpecifier AS, uint64_t offsetInBits, uint32_t AlignInBits, llvm::DIFile *tunit, llvm::DIScope *scope, - const RecordDecl *RD = nullptr); + const RecordDecl *RD = nullptr, + llvm::DINodeArray Annotations = nullptr); llvm::DIType *createFieldType(StringRef name, QualType type, SourceLocation loc, AccessSpecifier AS, diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index d70cbb64de84c..7453a923042b7 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -3499,7 +3499,7 @@ void CodeGenFunction::EmitTrapCheck(llvm::Value *Checked, if (!CGM.getCodeGenOpts().TrapFuncName.empty()) { auto A = llvm::Attribute::get(getLLVMContext(), "trap-func-name", CGM.getCodeGenOpts().TrapFuncName); - TrapCall->addAttribute(llvm::AttributeList::FunctionIndex, A); + TrapCall->addFnAttr(A); } TrapCall->setDoesNotReturn(); TrapCall->setDoesNotThrow(); @@ -3523,7 +3523,7 @@ llvm::CallInst *CodeGenFunction::EmitTrapCall(llvm::Intrinsic::ID IntrID) { if (!CGM.getCodeGenOpts().TrapFuncName.empty()) { auto A = llvm::Attribute::get(getLLVMContext(), "trap-func-name", CGM.getCodeGenOpts().TrapFuncName); - TrapCall->addAttribute(llvm::AttributeList::FunctionIndex, A); + TrapCall->addFnAttr(A); } return TrapCall; diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 96cf977ca2901..d86ae7ad17d68 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1323,8 +1323,7 @@ static RValue EmitNewDeleteCall(CodeGenFunction &CGF, llvm::Function *Fn = dyn_cast(CalleePtr); if (CalleeDecl->isReplaceableGlobalAllocationFunction() && Fn && Fn->hasFnAttribute(llvm::Attribute::NoBuiltin)) { - CallOrInvoke->addAttribute(llvm::AttributeList::FunctionIndex, - llvm::Attribute::Builtin); + CallOrInvoke->addFnAttr(llvm::Attribute::Builtin); } return RV; diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index b66d8d0bf869c..962ab55ee35d1 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -4806,11 +4806,8 @@ Value *ScalarExprEmitter::VisitAsTypeExpr(AsTypeExpr *E) { // vector to get a vec4, then a bitcast if the target type is different. if (NumElementsSrc == 3 && NumElementsDst != 3) { Src = ConvertVec3AndVec4(Builder, CGF, Src, 4); - - if (!CGF.CGM.getCodeGenOpts().PreserveVec3Type) { - Src = createCastsForTypeOfSameSize(Builder, CGF.CGM.getDataLayout(), Src, - DstTy); - } + Src = createCastsForTypeOfSameSize(Builder, CGF.CGM.getDataLayout(), Src, + DstTy); Src->setName("astype"); return Src; diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 90fcf2232be2f..5718546b3bb67 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -9435,34 +9435,50 @@ static void emitNonContiguousDescriptor( } } +// Try to extract the base declaration from a `this->x` expression if possible. +static ValueDecl *getDeclFromThisExpr(const Expr *E) { + if (!E) + return nullptr; + + if (const auto *OASE = dyn_cast(E->IgnoreParenCasts())) + if (const MemberExpr *ME = + dyn_cast(OASE->getBase()->IgnoreParenImpCasts())) + return ME->getMemberDecl(); + return nullptr; +} + /// Emit a string constant containing the names of the values mapped to the /// offloading runtime library. 
llvm::Constant * emitMappingInformation(CodeGenFunction &CGF, llvm::OpenMPIRBuilder &OMPBuilder, MappableExprsHandler::MappingExprInfo &MapExprs) { - llvm::Constant *SrcLocStr; - if (!MapExprs.getMapDecl()) { - SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(); + + if (!MapExprs.getMapDecl() && !MapExprs.getMapExpr()) + return OMPBuilder.getOrCreateDefaultSrcLocStr(); + + SourceLocation Loc; + if (!MapExprs.getMapDecl() && MapExprs.getMapExpr()) { + if (const ValueDecl *VD = getDeclFromThisExpr(MapExprs.getMapExpr())) + Loc = VD->getLocation(); + else + Loc = MapExprs.getMapExpr()->getExprLoc(); } else { - std::string ExprName = ""; - if (MapExprs.getMapExpr()) { - PrintingPolicy P(CGF.getContext().getLangOpts()); - llvm::raw_string_ostream OS(ExprName); - MapExprs.getMapExpr()->printPretty(OS, nullptr, P); - OS.flush(); - } else { - ExprName = MapExprs.getMapDecl()->getNameAsString(); - } + Loc = MapExprs.getMapDecl()->getLocation(); + } - SourceLocation Loc = MapExprs.getMapDecl()->getLocation(); - PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc); - const char *FileName = PLoc.getFilename(); - unsigned Line = PLoc.getLine(); - unsigned Column = PLoc.getColumn(); - SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(FileName, ExprName.c_str(), - Line, Column); + std::string ExprName = ""; + if (MapExprs.getMapExpr()) { + PrintingPolicy P(CGF.getContext().getLangOpts()); + llvm::raw_string_ostream OS(ExprName); + MapExprs.getMapExpr()->printPretty(OS, nullptr, P); + OS.flush(); + } else { + ExprName = MapExprs.getMapDecl()->getNameAsString(); } - return SrcLocStr; + + PresumedLoc PLoc = CGF.getContext().getSourceManager().getPresumedLoc(Loc); + return OMPBuilder.getOrCreateSrcLocStr(PLoc.getFilename(), ExprName.c_str(), + PLoc.getLine(), PLoc.getColumn()); } /// Emit the arrays used to pass the captures and map information to the diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp index 33d4ab838af12..cac5faaa8d0f2 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp @@ -20,6 +20,7 @@ #include "clang/AST/StmtVisitor.h" #include "clang/Basic/Cuda.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Frontend/OpenMP/OMPGridValues.h" #include "llvm/IR/IntrinsicsAMDGPU.h" using namespace clang; @@ -35,7 +36,7 @@ CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM) llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; // return constant compile-time target-specific warp size - unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; return Bld.getInt32(WarpSize); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 63fecedc6fb77..b13d55994ef68 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -339,7 +339,7 @@ class CheckVarsEscapingDeclContext final assert(!GlobalizedRD && "Record for globalized variables is built already."); ArrayRef EscapedDeclsForParallel, EscapedDeclsForTeams; - unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; if (IsInTTDRegion) EscapedDeclsForTeams = EscapedDecls.getArrayRef(); else @@ -535,8 +535,7 @@ class CheckVarsEscapingDeclContext final /// on the NVPTX device, to generate more efficient code. 
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; - unsigned LaneIDBits = - CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size_Log2); + unsigned LaneIDBits = CGF.getTarget().getGridValue().GV_Warp_Size_Log2; auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id"); } @@ -546,8 +545,8 @@ static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) { /// on the NVPTX device, to generate more efficient code. static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) { CGBuilderTy &Bld = CGF.Builder; - unsigned LaneIDMask = CGF.getContext().getTargetInfo().getGridValue( - llvm::omp::GV_Warp_Size_Log2_Mask); + unsigned LaneIDMask = + CGF.getContext().getTargetInfo().getGridValue().GV_Warp_Size_Log2_Mask; auto &RT = static_cast(CGF.CGM.getOpenMPRuntime()); return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask), "nvptx_lane_id"); @@ -1308,7 +1307,7 @@ llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction( const RecordDecl *GlobalizedRD = nullptr; llvm::SmallVector LastPrivatesReductions; llvm::SmallDenseMap MappedDeclsFields; - unsigned WarpSize = CGM.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size; // Globalize team reductions variable unconditionally in all modes. if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions); @@ -2089,7 +2088,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM, "__openmp_nvptx_data_transfer_temporary_storage"; llvm::GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName); - unsigned WarpSize = CGF.getTarget().getGridValue(llvm::omp::GV_Warp_Size); + unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; if (!TransferMedium) { auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize); unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h index b5f1b843c46b8..5d3b711e6d4b5 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -17,7 +17,6 @@ #include "CGOpenMPRuntime.h" #include "CodeGenFunction.h" #include "clang/AST/StmtOpenMP.h" -#include "llvm/Frontend/OpenMP/OMPGridValues.h" namespace clang { namespace CodeGen { diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 0a3a722fa6533..4ff1f7b3a85b9 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -2188,20 +2188,16 @@ static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect, CodeGenFunction &CGF, std::vector &RegResults) { if (!HasUnwindClobber) - Result.addAttribute(llvm::AttributeList::FunctionIndex, - llvm::Attribute::NoUnwind); + Result.addFnAttr(llvm::Attribute::NoUnwind); if (NoMerge) - Result.addAttribute(llvm::AttributeList::FunctionIndex, - llvm::Attribute::NoMerge); + Result.addFnAttr(llvm::Attribute::NoMerge); // Attach readnone and readonly attributes. 
if (!HasSideEffect) { if (ReadNone) - Result.addAttribute(llvm::AttributeList::FunctionIndex, - llvm::Attribute::ReadNone); + Result.addFnAttr(llvm::Attribute::ReadNone); else if (ReadOnly) - Result.addAttribute(llvm::AttributeList::FunctionIndex, - llvm::Attribute::ReadOnly); + Result.addFnAttr(llvm::Attribute::ReadOnly); } // Slap the source location of the inline asm into a !srcloc metadata on the @@ -2223,8 +2219,7 @@ static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect, // convergent (meaning, they may call an intrinsically convergent op, such // as bar.sync, and so can't have certain optimizations applied around // them). - Result.addAttribute(llvm::AttributeList::FunctionIndex, - llvm::Attribute::Convergent); + Result.addFnAttr(llvm::Attribute::Convergent); // Extract all of the register value results from the asm. if (ResultRegTypes.size() == 1) { RegResults.push_back(&Result); diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 2328dd8198426..b96515093a7f3 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -309,8 +309,8 @@ llvm::Value *CodeGenFunction::getTypeSize(QualType Ty) { while (const VariableArrayType *VAT = C.getAsVariableArrayType(Ty)) { VlaSizePair VlaSize = getVLASize(VAT); Ty = VlaSize.Type; - Size = Size ? Builder.CreateNUWMul(Size, VlaSize.NumElts) - : VlaSize.NumElts; + Size = + Size ? Builder.CreateNUWMul(Size, VlaSize.NumElts) : VlaSize.NumElts; } SizeInChars = C.getTypeSizeInChars(Ty); if (SizeInChars.isZero()) @@ -498,9 +498,8 @@ static llvm::Function *emitOutlinedFunctionPrologue( : CGM.getOpenMPRuntime().translateParameter(FD, Arg)); ++I; } - Args.append( - std::next(CD->param_begin(), CD->getContextParamPosition() + 1), - CD->param_end()); + Args.append(std::next(CD->param_begin(), CD->getContextParamPosition() + 1), + CD->param_end()); TargetArgs.append( std::next(CD->param_begin(), CD->getContextParamPosition() + 1), CD->param_end()); @@ -672,9 +671,9 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S, if (EI != VLASizes.end()) { CallArg = EI->second.second; } else { - LValue LV = WrapperCGF.MakeAddrLValue(WrapperCGF.GetAddrOfLocalVar(Arg), - Arg->getType(), - AlignmentSource::Decl); + LValue LV = + WrapperCGF.MakeAddrLValue(WrapperCGF.GetAddrOfLocalVar(Arg), + Arg->getType(), AlignmentSource::Decl); CallArg = WrapperCGF.EmitLoadOfScalar(LV, S.getBeginLoc()); } } @@ -719,29 +718,29 @@ void CodeGenFunction::EmitOMPAggregateAssign( CharUnits ElementSize = getContext().getTypeSizeInChars(ElementTy); llvm::PHINode *SrcElementPHI = - Builder.CreatePHI(SrcBegin->getType(), 2, "omp.arraycpy.srcElementPast"); + Builder.CreatePHI(SrcBegin->getType(), 2, "omp.arraycpy.srcElementPast"); SrcElementPHI->addIncoming(SrcBegin, EntryBB); Address SrcElementCurrent = Address(SrcElementPHI, SrcAddr.getAlignment().alignmentOfArrayElement(ElementSize)); - llvm::PHINode *DestElementPHI = - Builder.CreatePHI(DestBegin->getType(), 2, "omp.arraycpy.destElementPast"); + llvm::PHINode *DestElementPHI = Builder.CreatePHI( + DestBegin->getType(), 2, "omp.arraycpy.destElementPast"); DestElementPHI->addIncoming(DestBegin, EntryBB); Address DestElementCurrent = - Address(DestElementPHI, - DestAddr.getAlignment().alignmentOfArrayElement(ElementSize)); + Address(DestElementPHI, + DestAddr.getAlignment().alignmentOfArrayElement(ElementSize)); // Emit copy. CopyGen(DestElementCurrent, SrcElementCurrent); // Shift the address forward by one element. 
- llvm::Value *DestElementNext = Builder.CreateConstGEP1_32( - DestAddr.getElementType(), DestElementPHI, /*Idx0=*/1, - "omp.arraycpy.dest.element"); - llvm::Value *SrcElementNext = Builder.CreateConstGEP1_32( - SrcAddr.getElementType(), SrcElementPHI, /*Idx0=*/1, - "omp.arraycpy.src.element"); + llvm::Value *DestElementNext = + Builder.CreateConstGEP1_32(DestAddr.getElementType(), DestElementPHI, + /*Idx0=*/1, "omp.arraycpy.dest.element"); + llvm::Value *SrcElementNext = + Builder.CreateConstGEP1_32(SrcAddr.getElementType(), SrcElementPHI, + /*Idx0=*/1, "omp.arraycpy.src.element"); // Check whether we've reached the end. llvm::Value *Done = Builder.CreateICmpEQ(DestElementNext, DestEnd, "omp.arraycpy.done"); @@ -1004,9 +1003,9 @@ bool CodeGenFunction::EmitOMPCopyinClause(const OMPExecutableDirective &D) { LocalDeclMap.erase(VD); } else { MasterAddr = - Address(VD->isStaticLocal() ? CGM.getStaticLocalDeclAddress(VD) - : CGM.GetAddrOfGlobal(VD), - getContext().getDeclAlign(VD)); + Address(VD->isStaticLocal() ? CGM.getStaticLocalDeclAddress(VD) + : CGM.GetAddrOfGlobal(VD), + getContext().getDeclAlign(VD)); } // Get the address of the threadprivate variable. Address PrivateAddr = EmitLValue(*IRef).getAddress(*this); @@ -1077,7 +1076,7 @@ bool CodeGenFunction::EmitOMPLastprivateClauseInit( PrivateScope.addPrivate(DestVD, [this, OrigVD, IRef]() { DeclRefExpr DRE(getContext(), const_cast(OrigVD), /*RefersToEnclosingVariableOrCapture=*/ - CapturedStmtInfo->lookup(OrigVD) != nullptr, + CapturedStmtInfo->lookup(OrigVD) != nullptr, (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc()); return EmitLValue(&DRE).getAddress(*this); }); @@ -1086,19 +1085,19 @@ bool CodeGenFunction::EmitOMPLastprivateClauseInit( // for 'firstprivate' clause. if (IInit && !SIMDLCVs.count(OrigVD->getCanonicalDecl())) { const auto *VD = cast(cast(IInit)->getDecl()); - bool IsRegistered = PrivateScope.addPrivate(OrigVD, [this, VD, C, - OrigVD]() { - if (C->getKind() == OMPC_LASTPRIVATE_conditional) { - Address VDAddr = - CGM.getOpenMPRuntime().emitLastprivateConditionalInit(*this, - OrigVD); - setAddrOfLocalVar(VD, VDAddr); - return VDAddr; - } - // Emit private VarDecl with copy init. - EmitDecl(*VD); - return GetAddrOfLocalVar(VD); - }); + bool IsRegistered = + PrivateScope.addPrivate(OrigVD, [this, VD, C, OrigVD]() { + if (C->getKind() == OMPC_LASTPRIVATE_conditional) { + Address VDAddr = + CGM.getOpenMPRuntime().emitLastprivateConditionalInit( + *this, OrigVD); + setAddrOfLocalVar(VD, VDAddr); + return VDAddr; + } + // Emit private VarDecl with copy init. + EmitDecl(*VD); + return GetAddrOfLocalVar(VD); + }); assert(IsRegistered && "lastprivate var already registered as private"); (void)IsRegistered; @@ -1293,14 +1292,12 @@ void CodeGenFunction::EmitOMPReductionClauseInit( OriginalAddr, ConvertTypeForMem(LHSVD->getType()), "lhs.begin"); } PrivateScope.addPrivate(LHSVD, [OriginalAddr]() { return OriginalAddr; }); - PrivateScope.addPrivate( - RHSVD, [this, PrivateVD, RHSVD, IsArray]() { - return IsArray - ? Builder.CreateElementBitCast( + PrivateScope.addPrivate(RHSVD, [this, PrivateVD, RHSVD, IsArray]() { + return IsArray ? 
Builder.CreateElementBitCast( GetAddrOfLocalVar(PrivateVD), ConvertTypeForMem(RHSVD->getType()), "rhs.begin") : GetAddrOfLocalVar(PrivateVD); - }); + }); } ++ILHS; ++IRHS; @@ -2114,9 +2111,10 @@ bool CodeGenFunction::EmitOMPLinearClauseInit(const OMPLoopDirective &D) { CapturedStmtInfo->lookup(OrigVD) != nullptr, VD->getInit()->getType(), VK_LValue, VD->getInit()->getExprLoc()); - EmitExprAsInit(&DRE, VD, MakeAddrLValue(Emission.getAllocatedAddress(), - VD->getType()), - /*capturedByInit=*/false); + EmitExprAsInit( + &DRE, VD, + MakeAddrLValue(Emission.getAllocatedAddress(), VD->getType()), + /*capturedByInit=*/false); EmitAutoVarCleanups(Emission); } else { EmitVarDecl(*VD); @@ -2219,9 +2217,8 @@ void CodeGenFunction::EmitOMPPrivateLoopCounters( AutoVarEmission VarEmission = EmitAutoVarAlloca(*PrivateVD); EmitAutoVarCleanups(VarEmission); LocalDeclMap.erase(PrivateVD); - (void)LoopScope.addPrivate(VD, [&VarEmission]() { - return VarEmission.getAllocatedAddress(); - }); + (void)LoopScope.addPrivate( + VD, [&VarEmission]() { return VarEmission.getAllocatedAddress(); }); if (LocalDeclMap.count(VD) || CapturedStmtInfo->lookup(VD) || VD->hasGlobalStorage()) { (void)LoopScope.addPrivate(PrivateVD, [this, VD, E]() { @@ -2273,7 +2270,7 @@ static void emitPreCond(CodeGenFunction &CGF, const OMPLoopDirective &S, // Create temp loop control variables with their init values to support // non-rectangular loops. CodeGenFunction::OMPMapVars PreCondVars; - for (const Expr * E: S.dependent_counters()) { + for (const Expr *E : S.dependent_counters()) { if (!E) continue; assert(!E->getType().getNonReferenceType()->isRecordType() && @@ -2733,12 +2730,10 @@ void CodeGenFunction::EmitOMPForOuterLoop( CGOpenMPRuntime &RT = CGM.getOpenMPRuntime(); // Dynamic scheduling of the outer loop (dynamic, guided, auto, runtime). - const bool DynamicOrOrdered = - Ordered || RT.isDynamic(ScheduleKind.Schedule); + const bool DynamicOrOrdered = Ordered || RT.isDynamic(ScheduleKind.Schedule); - assert((Ordered || - !RT.isStaticNonchunked(ScheduleKind.Schedule, - LoopArgs.Chunk != nullptr)) && + assert((Ordered || !RT.isStaticNonchunked(ScheduleKind.Schedule, + LoopArgs.Chunk != nullptr)) && "static non-chunked schedule does not need outer loop"); // Emit outer loop. @@ -3058,15 +3053,15 @@ void CodeGenFunction::EmitOMPTargetSimdDirective( } namespace { - struct ScheduleKindModifiersTy { - OpenMPScheduleClauseKind Kind; - OpenMPScheduleClauseModifier M1; - OpenMPScheduleClauseModifier M2; - ScheduleKindModifiersTy(OpenMPScheduleClauseKind Kind, - OpenMPScheduleClauseModifier M1, - OpenMPScheduleClauseModifier M2) - : Kind(Kind), M1(M1), M2(M2) {} - }; +struct ScheduleKindModifiersTy { + OpenMPScheduleClauseKind Kind; + OpenMPScheduleClauseModifier M1; + OpenMPScheduleClauseModifier M2; + ScheduleKindModifiersTy(OpenMPScheduleClauseKind Kind, + OpenMPScheduleClauseModifier M1, + OpenMPScheduleClauseModifier M2) + : Kind(Kind), M1(M1), M2(M2) {} +}; } // namespace bool CodeGenFunction::EmitOMPWorksharingLoop( @@ -3186,8 +3181,10 @@ bool CodeGenFunction::EmitOMPWorksharingLoop( // If the static schedule kind is specified or if the ordered clause is // specified, and if no monotonic modifier is specified, the effect will // be as if the monotonic modifier was specified. 
- bool StaticChunkedOne = RT.isStaticChunked(ScheduleKind.Schedule, - /* Chunked */ Chunk != nullptr) && HasChunkSizeOne && + bool StaticChunkedOne = + RT.isStaticChunked(ScheduleKind.Schedule, + /* Chunked */ Chunk != nullptr) && + HasChunkSizeOne && isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()); bool IsMonotonic = Ordered || @@ -4442,7 +4439,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( const auto *OrigVD = cast(Pair.second->getDecl()); DeclRefExpr DRE(CGF.getContext(), const_cast(OrigVD), /*RefersToEnclosingVariableOrCapture=*/ - CGF.CapturedStmtInfo->lookup(OrigVD) != nullptr, + CGF.CapturedStmtInfo->lookup(OrigVD) != nullptr, Pair.second->getType(), VK_LValue, Pair.second->getExprLoc()); Scope.addPrivate(Pair.first, [&CGF, &DRE]() { @@ -5170,8 +5167,8 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, // iteration space is divided into chunks that are approximately equal // in size, and at most one chunk is distributed to each team of the // league. The size of the chunks is unspecified in this case. - bool StaticChunked = RT.isStaticChunked( - ScheduleKind, /* Chunked */ Chunk != nullptr) && + bool StaticChunked = + RT.isStaticChunked(ScheduleKind, /* Chunked */ Chunk != nullptr) && isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()); if (RT.isStaticNonchunked(ScheduleKind, /* Chunked */ Chunk != nullptr) || @@ -5942,8 +5939,7 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF, return; } - auto LPCRegion = - CGOpenMPRuntime::LastprivateConditionalRAII::disable(CGF, S); + auto LPCRegion = CGOpenMPRuntime::LastprivateConditionalRAII::disable(CGF, S); llvm::Function *Fn = nullptr; llvm::Constant *FnID = nullptr; @@ -6473,7 +6469,8 @@ void CodeGenFunction::EmitOMPUseDevicePtrClause( auto OrigVarIt = C.varlist_begin(); auto InitIt = C.inits().begin(); for (const Expr *PvtVarIt : C.private_copies()) { - const auto *OrigVD = cast(cast(*OrigVarIt)->getDecl()); + const auto *OrigVD = + cast(cast(*OrigVarIt)->getDecl()); const auto *InitVD = cast(cast(*InitIt)->getDecl()); const auto *PvtVD = cast(cast(PvtVarIt)->getDecl()); @@ -6496,31 +6493,30 @@ void CodeGenFunction::EmitOMPUseDevicePtrClause( if (InitAddrIt == CaptureDeviceAddrMap.end()) continue; - bool IsRegistered = PrivateScope.addPrivate(OrigVD, [this, OrigVD, - InitAddrIt, InitVD, - PvtVD]() { - // Initialize the temporary initialization variable with the address we - // get from the runtime library. We have to cast the source address - // because it is always a void *. References are materialized in the - // privatization scope, so the initialization here disregards the fact - // the original variable is a reference. - QualType AddrQTy = - getContext().getPointerType(OrigVD->getType().getNonReferenceType()); - llvm::Type *AddrTy = ConvertTypeForMem(AddrQTy); - Address InitAddr = Builder.CreateBitCast(InitAddrIt->second, AddrTy); - setAddrOfLocalVar(InitVD, InitAddr); - - // Emit private declaration, it will be initialized by the value we - // declaration we just added to the local declarations map. - EmitDecl(*PvtVD); - - // The initialization variables reached its purpose in the emission - // of the previous declaration, so we don't need it anymore. - LocalDeclMap.erase(InitVD); - - // Return the address of the private variable. 
- return GetAddrOfLocalVar(PvtVD); - }); + bool IsRegistered = PrivateScope.addPrivate( + OrigVD, [this, OrigVD, InitAddrIt, InitVD, PvtVD]() { + // Initialize the temporary initialization variable with the address + // we get from the runtime library. We have to cast the source address + // because it is always a void *. References are materialized in the + // privatization scope, so the initialization here disregards the fact + // the original variable is a reference. + QualType AddrQTy = getContext().getPointerType( + OrigVD->getType().getNonReferenceType()); + llvm::Type *AddrTy = ConvertTypeForMem(AddrQTy); + Address InitAddr = Builder.CreateBitCast(InitAddrIt->second, AddrTy); + setAddrOfLocalVar(InitVD, InitAddr); + + // Emit private declaration, it will be initialized by the value we + // declaration we just added to the local declarations map. + EmitDecl(*PvtVD); + + // The initialization variables reached its purpose in the emission + // of the previous declaration, so we don't need it anymore. + LocalDeclMap.erase(InitVD); + + // Return the address of the private variable. + return GetAddrOfLocalVar(PvtVD); + }); assert(IsRegistered && "firstprivate var already registered as private"); // Silence the warning about unused variable. (void)IsRegistered; @@ -6881,11 +6877,11 @@ void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) { // TODO: Check if we should emit tied or untied task. Data.Tied = true; // Set scheduling for taskloop - if (const auto* Clause = S.getSingleClause()) { + if (const auto *Clause = S.getSingleClause()) { // grainsize clause Data.Schedule.setInt(/*IntVal=*/false); Data.Schedule.setPointer(EmitScalarExpr(Clause->getGrainsize())); - } else if (const auto* Clause = S.getSingleClause()) { + } else if (const auto *Clause = S.getSingleClause()) { // num_tasks clause Data.Schedule.setInt(/*IntVal=*/true); Data.Schedule.setPointer(EmitScalarExpr(Clause->getNumTasks())); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 21169e4ffaa78..d3fefb41ef88e 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -383,6 +383,9 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { "__cyg_profile_func_exit"); } + if (ShouldSkipSanitizerInstrumentation()) + CurFn->addFnAttr(llvm::Attribute::DisableSanitizerInstrumentation); + // Emit debug descriptor for function end. if (CGDebugInfo *DI = getDebugInfo()) DI->EmitFunctionEnd(Builder, CurFn); @@ -486,11 +489,13 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { // function. CurFn->addFnAttr("min-legal-vector-width", llvm::utostr(LargestVectorWidth)); - // Add vscale attribute if appropriate. - if (getLangOpts().ArmSveVectorBits) { - unsigned VScale = getLangOpts().ArmSveVectorBits / 128; - CurFn->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs(getLLVMContext(), - VScale, VScale)); + // Add vscale_range attribute if appropriate. + Optional> VScaleRange = + getContext().getTargetInfo().getVScaleRange(getLangOpts()); + if (VScaleRange) { + CurFn->addFnAttr(llvm::Attribute::getWithVScaleRangeArgs( + getLLVMContext(), VScaleRange.getValue().first, + VScaleRange.getValue().second)); } // If we generated an unreachable return block, delete it now. 
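As a concrete illustration of the mapping above (not part of the patch): vscale counts 128-bit granules, so a fixed SVE vector length of 512 bits pins the range to (4, 4), which is emitted as the IR function attribute vscale_range(4,4). A minimal stand-alone sketch, with vscaleRangeForFixedSve standing in for the TargetInfo::getVScaleRange query:

#include <cstdio>
#include <utility>

// Hypothetical stand-in for the getVScaleRange() query: a fixed SVE vector
// length of SveBits pins vscale to SveBits/128 for both ends of the range.
static std::pair<unsigned, unsigned> vscaleRangeForFixedSve(unsigned SveBits) {
  unsigned VScale = SveBits / 128;
  return {VScale, VScale};
}

int main() {
  std::pair<unsigned, unsigned> R = vscaleRangeForFixedSve(512);
  std::printf("vscale_range(%u,%u)\n", R.first, R.second); // vscale_range(4,4)
  return 0;
}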
@@ -519,6 +524,12 @@ bool CodeGenFunction::ShouldInstrumentFunction() { return true; } +bool CodeGenFunction::ShouldSkipSanitizerInstrumentation() { + if (!CurFuncDecl) + return false; + return CurFuncDecl->hasAttr(); +} + /// ShouldXRayInstrument - Return true if the current function should be /// instrumented with XRay nop sleds. bool CodeGenFunction::ShouldXRayInstrumentFunction() const { diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 0c451aef8a950..6fca0eb6933e6 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -2286,6 +2286,10 @@ class CodeGenFunction : public CodeGenTypeCache { /// instrumented with __cyg_profile_func_* calls bool ShouldInstrumentFunction(); + /// ShouldSkipSanitizerInstrumentation - Return true if the current function + /// should not be instrumented with sanitizers. + bool ShouldSkipSanitizerInstrumentation(); + /// ShouldXRayInstrument - Return true if the current function should be /// instrumented with XRay nop sleds. bool ShouldXRayInstrumentFunction() const; @@ -4126,30 +4130,30 @@ class CodeGenFunction : public CodeGenTypeCache { /// SVEBuiltinMemEltTy - Returns the memory element type for this memory /// access builtin. Only required if it can't be inferred from the base /// pointer operand. - llvm::Type *SVEBuiltinMemEltTy(SVETypeFlags TypeFlags); + llvm::Type *SVEBuiltinMemEltTy(const SVETypeFlags &TypeFlags); - SmallVector getSVEOverloadTypes(SVETypeFlags TypeFlags, - llvm::Type *ReturnType, - ArrayRef Ops); - llvm::Type *getEltType(SVETypeFlags TypeFlags); + SmallVector + getSVEOverloadTypes(const SVETypeFlags &TypeFlags, llvm::Type *ReturnType, + ArrayRef Ops); + llvm::Type *getEltType(const SVETypeFlags &TypeFlags); llvm::ScalableVectorType *getSVEType(const SVETypeFlags &TypeFlags); - llvm::ScalableVectorType *getSVEPredType(SVETypeFlags TypeFlags); - llvm::Value *EmitSVEAllTruePred(SVETypeFlags TypeFlags); + llvm::ScalableVectorType *getSVEPredType(const SVETypeFlags &TypeFlags); + llvm::Value *EmitSVEAllTruePred(const SVETypeFlags &TypeFlags); llvm::Value *EmitSVEDupX(llvm::Value *Scalar); llvm::Value *EmitSVEDupX(llvm::Value *Scalar, llvm::Type *Ty); llvm::Value *EmitSVEReinterpret(llvm::Value *Val, llvm::Type *Ty); - llvm::Value *EmitSVEPMull(SVETypeFlags TypeFlags, + llvm::Value *EmitSVEPMull(const SVETypeFlags &TypeFlags, llvm::SmallVectorImpl &Ops, unsigned BuiltinID); - llvm::Value *EmitSVEMovl(SVETypeFlags TypeFlags, + llvm::Value *EmitSVEMovl(const SVETypeFlags &TypeFlags, llvm::ArrayRef Ops, unsigned BuiltinID); llvm::Value *EmitSVEPredicateCast(llvm::Value *Pred, llvm::ScalableVectorType *VTy); - llvm::Value *EmitSVEGatherLoad(SVETypeFlags TypeFlags, + llvm::Value *EmitSVEGatherLoad(const SVETypeFlags &TypeFlags, llvm::SmallVectorImpl &Ops, unsigned IntID); - llvm::Value *EmitSVEScatterStore(SVETypeFlags TypeFlags, + llvm::Value *EmitSVEScatterStore(const SVETypeFlags &TypeFlags, llvm::SmallVectorImpl &Ops, unsigned IntID); llvm::Value *EmitSVEMaskedLoad(const CallExpr *, llvm::Type *ReturnTy, @@ -4158,15 +4162,16 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitSVEMaskedStore(const CallExpr *, SmallVectorImpl &Ops, unsigned BuiltinID); - llvm::Value *EmitSVEPrefetchLoad(SVETypeFlags TypeFlags, + llvm::Value *EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned BuiltinID); - llvm::Value *EmitSVEGatherPrefetch(SVETypeFlags TypeFlags, + llvm::Value *EmitSVEGatherPrefetch(const SVETypeFlags &TypeFlags, 
SmallVectorImpl &Ops, unsigned IntID); - llvm::Value *EmitSVEStructLoad(SVETypeFlags TypeFlags, - SmallVectorImpl &Ops, unsigned IntID); - llvm::Value *EmitSVEStructStore(SVETypeFlags TypeFlags, + llvm::Value *EmitSVEStructLoad(const SVETypeFlags &TypeFlags, + SmallVectorImpl &Ops, + unsigned IntID); + llvm::Value *EmitSVEStructStore(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned IntID); llvm::Value *EmitAArch64SVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 0632e79324652..82cef26a49fe6 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1848,7 +1848,7 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, CodeGenOpts.getInlining() == CodeGenOptions::OnlyAlwaysInlining) B.addAttribute(llvm::Attribute::NoInline); - F->addAttributes(llvm::AttributeList::FunctionIndex, B); + F->addFnAttrs(B); return; } @@ -1935,7 +1935,7 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, B.addAttribute(llvm::Attribute::MinSize); } - F->addAttributes(llvm::AttributeList::FunctionIndex, B); + F->addFnAttrs(B); unsigned alignment = D->getMaxAlignment() / Context.getCharWidth(); if (alignment) @@ -1985,7 +1985,7 @@ void CodeGenModule::setLLVMFunctionFEnvAttributes(const FunctionDecl *D, if (D->hasAttr()) { llvm::AttrBuilder FuncAttrs; FuncAttrs.addAttribute("strictfp"); - F->addAttributes(llvm::AttributeList::FunctionIndex, FuncAttrs); + F->addFnAttrs(FuncAttrs); } } @@ -2101,8 +2101,8 @@ void CodeGenModule::setNonAliasAttributes(GlobalDecl GD, RemoveAttrs.addAttribute("target-cpu"); RemoveAttrs.addAttribute("target-features"); RemoveAttrs.addAttribute("tune-cpu"); - F->removeAttributes(llvm::AttributeList::FunctionIndex, RemoveAttrs); - F->addAttributes(llvm::AttributeList::FunctionIndex, Attrs); + F->removeFnAttrs(RemoveAttrs); + F->addFnAttrs(Attrs); } } @@ -2185,7 +2185,7 @@ void CodeGenModule::SetFunctionAttributes(GlobalDecl GD, llvm::Function *F, F->arg_begin()->getType() ->canLosslesslyBitCastTo(F->getReturnType()) && "unexpected this return"); - F->addAttribute(1, llvm::Attribute::Returned); + F->addParamAttr(0, llvm::Attribute::Returned); } // Only a few attributes are set on declarations; these may later be @@ -2211,15 +2211,13 @@ void CodeGenModule::SetFunctionAttributes(GlobalDecl GD, llvm::Function *F, assert(HasBody && "Inline builtin declarations should always have an " "available body!"); if (shouldEmitFunction(FDBody)) - F->addAttribute(llvm::AttributeList::FunctionIndex, - llvm::Attribute::NoBuiltin); + F->addFnAttr(llvm::Attribute::NoBuiltin); } if (FD->isReplaceableGlobalAllocationFunction()) { // A replaceable global allocation function does not act like a builtin by // default, only if it is invoked by a new-expression or delete-expression. 
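For context, the "replaceable global allocation function" mentioned above is an ordinary user replacement of the global operator new/delete; a minimal illustrative example (not part of the patch):

#include <cstdlib>
#include <new>

// Definitions like these replace the global allocation functions. Per the
// comment above they do not act like builtins by default, so direct calls
// reach these bodies; only new-/delete-expressions may apply builtin
// semantics.
void *operator new(std::size_t Size) {
  if (void *P = std::malloc(Size))
    return P;
  throw std::bad_alloc();
}

void operator delete(void *P) noexcept { std::free(P); }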
- F->addAttribute(llvm::AttributeList::FunctionIndex, - llvm::Attribute::NoBuiltin); + F->addFnAttr(llvm::Attribute::NoBuiltin); } if (isa(FD) || isa(FD)) @@ -3736,9 +3734,9 @@ llvm::Constant *CodeGenModule::GetOrCreateLLVMFunction( assert(F->getName() == MangledName && "name was uniqued!"); if (D) SetFunctionAttributes(GD, F, IsIncompleteFunction, IsThunk); - if (ExtraAttrs.hasAttributes(llvm::AttributeList::FunctionIndex)) { + if (ExtraAttrs.hasFnAttrs()) { llvm::AttrBuilder B(ExtraAttrs, llvm::AttributeList::FunctionIndex); - F->addAttributes(llvm::AttributeList::FunctionIndex, B); + F->addFnAttrs(B); } if (!DontDefer) { @@ -3884,8 +3882,7 @@ CodeGenModule::CreateRuntimeFunction(llvm::FunctionType *FTy, StringRef Name, bool AssumeConvergent) { if (AssumeConvergent) { ExtraAttrs = - ExtraAttrs.addAttribute(VMContext, llvm::AttributeList::FunctionIndex, - llvm::Attribute::Convergent); + ExtraAttrs.addFnAttribute(VMContext, llvm::Attribute::Convergent); } llvm::Constant *C = @@ -5034,7 +5031,7 @@ static void replaceUsesOfNonProtoConstant(llvm::Constant *old, } // Add any parameter attributes. - newArgAttrs.push_back(oldAttrs.getParamAttributes(argNo)); + newArgAttrs.push_back(oldAttrs.getParamAttrs(argNo)); argNo++; } if (dontTransform) @@ -5062,9 +5059,9 @@ static void replaceUsesOfNonProtoConstant(llvm::Constant *old, if (!newCall->getType()->isVoidTy()) newCall->takeName(callSite); - newCall->setAttributes(llvm::AttributeList::get( - newFn->getContext(), oldAttrs.getFnAttributes(), - oldAttrs.getRetAttributes(), newArgAttrs)); + newCall->setAttributes( + llvm::AttributeList::get(newFn->getContext(), oldAttrs.getFnAttrs(), + oldAttrs.getRetAttrs(), newArgAttrs)); newCall->setCallingConv(callSite->getCallingConv()); // Finally, remove the old call, replacing any uses with the new one. diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index f752347a90ac9..4005c5c731ed6 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -827,19 +827,19 @@ class WebAssemblyTargetCodeGenInfo final : public TargetCodeGenInfo { llvm::Function *Fn = cast(GV); llvm::AttrBuilder B; B.addAttribute("wasm-import-module", Attr->getImportModule()); - Fn->addAttributes(llvm::AttributeList::FunctionIndex, B); + Fn->addFnAttrs(B); } if (const auto *Attr = FD->getAttr()) { llvm::Function *Fn = cast(GV); llvm::AttrBuilder B; B.addAttribute("wasm-import-name", Attr->getImportName()); - Fn->addAttributes(llvm::AttributeList::FunctionIndex, B); + Fn->addFnAttrs(B); } if (const auto *Attr = FD->getAttr()) { llvm::Function *Fn = cast(GV); llvm::AttrBuilder B; B.addAttribute("wasm-export-name", Attr->getExportName()); - Fn->addAttributes(llvm::AttributeList::FunctionIndex, B); + Fn->addFnAttrs(B); } } @@ -1524,6 +1524,14 @@ ABIArgInfo X86_32ABIInfo::classifyReturnType(QualType RetTy, if (isEmptyRecord(getContext(), RetTy, true)) return ABIArgInfo::getIgnore(); + // Return complex of _Float16 as <2 x half> so the backend will use xmm0. + if (const ComplexType *CT = RetTy->getAs()) { + QualType ET = getContext().getCanonicalType(CT->getElementType()); + if (ET->isFloat16Type()) + return ABIArgInfo::getDirect(llvm::FixedVectorType::get( + llvm::Type::getHalfTy(getVMContext()), 2)); + } + // Small structures which are register sized are generally returned // in a register. 
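A minimal example of the return type targeted by the new _Float16 handling above (illustrative only; assumes a target where _Float16 is available):

// With the change above, on 32-bit x86 this _Complex _Float16 return value is
// classified as a direct <2 x half>, so the backend uses %xmm0 for it.
_Float16 _Complex make_cf16(_Float16 Re, _Float16 Im) {
  return __builtin_complex(Re, Im);
}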
if (shouldReturnTypeInRegister(RetTy, getContext())) { @@ -6393,7 +6401,7 @@ class ARMTargetCodeGenInfo : public TargetCodeGenInfo { // the backend to perform a realignment as part of the function prologue. llvm::AttrBuilder B; B.addStackAlignmentAttr(8); - Fn->addAttributes(llvm::AttributeList::FunctionIndex, B); + Fn->addFnAttrs(B); } }; diff --git a/clang/lib/Driver/Distro.cpp b/clang/lib/Driver/Distro.cpp index c4cf4e48b5b8d..cdb5a1725750f 100644 --- a/clang/lib/Driver/Distro.cpp +++ b/clang/lib/Driver/Distro.cpp @@ -150,6 +150,8 @@ static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { return Distro::DebianBuster; case 11: return Distro::DebianBullseye; + case 12: + return Distro::DebianBookworm; default: return Distro::UnknownDistro; } @@ -161,6 +163,7 @@ static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { .Case("stretch/sid", Distro::DebianStretch) .Case("buster/sid", Distro::DebianBuster) .Case("bullseye/sid", Distro::DebianBullseye) + .Case("bookworm/sid", Distro::DebianBookworm) .Default(Distro::UnknownDistro); } diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp index cebf9d13a4ce0..5a12406a51cc6 100644 --- a/clang/lib/Driver/ToolChains/AVR.cpp +++ b/clang/lib/Driver/ToolChains/AVR.cpp @@ -453,11 +453,21 @@ void AVR::Linker::ConstructJob(Compilation &C, const JobAction &JA, } llvm::Optional AVRToolChain::findAVRLibcInstallation() const { + // Search avr-libc installation according to avr-gcc installation. + std::string GCCParent(GCCInstallation.getParentLibPath()); + std::string Path(GCCParent + "/avr"); + if (llvm::sys::fs::is_directory(Path)) + return Path; + Path = GCCParent + "/../avr"; + if (llvm::sys::fs::is_directory(Path)) + return Path; + + // Search avr-libc installation from possible locations, and return the first + // one that exists, if there is no avr-gcc installed. for (StringRef PossiblePath : PossibleAVRLibcLocations) { std::string Path = getDriver().SysRoot + PossiblePath.str(); - // Return the first avr-libc installation that exists. 
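The resulting avr-libc lookup order can be summarized by the sketch below; findAVRLibc is a hypothetical free function mirroring the logic above, not an existing helper:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FileSystem.h"
#include <string>

// Hypothetical summary of the lookup order: candidates derived from the
// detected avr-gcc installation win; otherwise fall back to the sysroot-based
// list and return the first directory that exists.
static llvm::Optional<std::string>
findAVRLibc(llvm::StringRef GCCParentLibPath, llvm::StringRef SysRoot,
            llvm::ArrayRef<llvm::StringRef> Candidates) {
  for (const std::string &P : {GCCParentLibPath.str() + "/avr",
                               GCCParentLibPath.str() + "/../avr"})
    if (llvm::sys::fs::is_directory(P))
      return P;
  for (llvm::StringRef C : Candidates) {
    std::string P = SysRoot.str() + C.str();
    if (llvm::sys::fs::is_directory(P))
      return P;
  }
  return llvm::None;
}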
if (llvm::sys::fs::is_directory(Path)) - return Optional(Path); + return Path; } return llvm::None; diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index 314d0efce4414..5f5964ec982b6 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -146,6 +146,8 @@ void hexagon::Assembler::ConstructJob(Compilation &C, const JobAction &JA, "-mcpu=hexagon" + toolchains::HexagonToolChain::GetTargetCPUVersion(Args))); + addSanitizerRuntimes(HTC, Args, CmdArgs); + if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); @@ -223,6 +225,8 @@ constructHexagonLinkArgs(Compilation &C, const JobAction &JA, bool UseShared = IsShared && !IsStatic; StringRef CpuVer = toolchains::HexagonToolChain::GetTargetCPUVersion(Args); + bool NeedsSanitizerDeps = addSanitizerRuntimes(HTC, Args, CmdArgs); + //---------------------------------------------------------------------------- // Silence warnings for various options //---------------------------------------------------------------------------- @@ -288,6 +292,12 @@ constructHexagonLinkArgs(Compilation &C, const JobAction &JA, AddLinkerInputs(HTC, Inputs, Args, CmdArgs, JA); if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) { + if (NeedsSanitizerDeps) { + linkSanitizerRuntimeDeps(HTC, CmdArgs); + + CmdArgs.push_back("-lunwind"); + } + CmdArgs.push_back("-lclang_rt.builtins-hexagon"); CmdArgs.push_back("-lc"); } @@ -450,6 +460,13 @@ Optional HexagonToolChain::getSmallDataThreshold( return None; } +std::string HexagonToolChain::getCompilerRTPath() const { + SmallString<128> Dir(getDriver().SysRoot); + llvm::sys::path::append(Dir, "usr", "lib"); + Dir += SelectedMultilib.gccSuffix(); + return std::string(Dir.str()); +} + void HexagonToolChain::getHexagonLibraryPaths(const ArgList &Args, ToolChain::path_list &LibPaths) const { const Driver &D = getDriver(); diff --git a/clang/lib/Driver/ToolChains/Hexagon.h b/clang/lib/Driver/ToolChains/Hexagon.h index c32cb7f09591a..9dc9b3ceddde1 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.h +++ b/clang/lib/Driver/ToolChains/Hexagon.h @@ -104,6 +104,8 @@ class LLVM_LIBRARY_VISIBILITY HexagonToolChain : public Linux { void getHexagonLibraryPaths(const llvm::opt::ArgList &Args, ToolChain::path_list &LibPaths) const; + std::string getCompilerRTPath() const override; + static bool isAutoHVXEnabled(const llvm::opt::ArgList &Args); static const StringRef GetDefaultCPU(); static const StringRef GetTargetCPUVersion(const llvm::opt::ArgList &Args); diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 7d4b67a611e7b..0bae9c759501e 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -699,6 +699,7 @@ SanitizerMask Linux::getSupportedSanitizers() const { getTriple().getArch() == llvm::Triple::thumbeb; const bool IsRISCV64 = getTriple().getArch() == llvm::Triple::riscv64; const bool IsSystemZ = getTriple().getArch() == llvm::Triple::systemz; + const bool IsHexagon = getTriple().getArch() == llvm::Triple::hexagon; SanitizerMask Res = ToolChain::getSupportedSanitizers(); Res |= SanitizerKind::Address; Res |= SanitizerKind::PointerCompare; @@ -712,7 +713,7 @@ SanitizerMask Linux::getSupportedSanitizers() const { if (IsX86_64 || IsMIPS64 || IsAArch64) Res |= SanitizerKind::DataFlow; if (IsX86_64 || IsMIPS64 || IsAArch64 || IsX86 || IsArmArch || IsPowerPC64 || - IsRISCV64 || IsSystemZ) + IsRISCV64 || IsSystemZ || 
IsHexagon) Res |= SanitizerKind::Leak; if (IsX86_64 || IsMIPS64 || IsAArch64 || IsPowerPC64 || IsSystemZ) Res |= SanitizerKind::Thread; @@ -721,7 +722,7 @@ SanitizerMask Linux::getSupportedSanitizers() const { if (IsX86 || IsX86_64) Res |= SanitizerKind::Function; if (IsX86_64 || IsMIPS64 || IsAArch64 || IsX86 || IsMIPS || IsArmArch || - IsPowerPC64) + IsPowerPC64 || IsHexagon) Res |= SanitizerKind::Scudo; if (IsX86_64 || IsAArch64) { Res |= SanitizerKind::HWAddress; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 0c4cacab50506..8487875064aa8 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -14,7 +14,6 @@ #include "UnwrappedLineParser.h" #include "FormatToken.h" -#include "clang/Basic/TokenKinds.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -995,6 +994,13 @@ static bool isJSDeclOrStmt(const AdditionalKeywords &Keywords, Keywords.kw_import, tok::kw_export); } +// Checks whether a token is a type in K&R C (aka C78). +static bool isC78Type(const FormatToken &Tok) { + return Tok.isOneOf(tok::kw_char, tok::kw_short, tok::kw_int, tok::kw_long, + tok::kw_unsigned, tok::kw_float, tok::kw_double, + tok::identifier); +} + // This function checks whether a token starts the first parameter declaration // in a K&R C (aka C78) function definition, e.g.: // int f(a, b) @@ -1002,13 +1008,24 @@ static bool isJSDeclOrStmt(const AdditionalKeywords &Keywords, // { // return a + b; // } -static bool isC78ParameterDecl(const FormatToken *Tok) { - if (!Tok) +static bool isC78ParameterDecl(const FormatToken *Tok, const FormatToken *Next, + const FormatToken *FuncName) { + assert(Tok); + assert(Next); + assert(FuncName); + + if (FuncName->isNot(tok::identifier)) return false; - if (!Tok->isOneOf(tok::kw_int, tok::kw_char, tok::kw_float, tok::kw_double, - tok::kw_struct, tok::kw_union, tok::kw_long, tok::kw_short, - tok::kw_unsigned, tok::kw_register)) + const FormatToken *Prev = FuncName->Previous; + if (!Prev || (Prev->isNot(tok::star) && !isC78Type(*Prev))) + return false; + + if (!isC78Type(*Tok) && + !Tok->isOneOf(tok::kw_register, tok::kw_struct, tok::kw_union)) + return false; + + if (Next->isNot(tok::star) && !Next->Tok.getIdentifierInfo()) return false; Tok = Tok->Previous; @@ -1369,22 +1386,20 @@ void UnwrappedLineParser::parseStructuralElement(bool IsTopLevel) { case tok::r_brace: addUnwrappedLine(); return; - case tok::l_paren: + case tok::l_paren: { parseParens(); // Break the unwrapped line if a K&R C function definition has a parameter // declaration. 
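Beyond the int f(a, b) example already quoted in the comment above, the tightened heuristic is also meant to recognize C78 definitions like the following (illustrative K&R C, not part of the patch):

/* Illustrative C78-style definition: the function name is preceded by '*',
 * and the tokens following the parameter list are 'register', a C78 type,
 * and an identifier, so clang-format breaks the parameter declarations onto
 * their own lines. */
char *advance(s, n)
register char *s;
unsigned n;
{
  return s + n;
}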
- if (!IsTopLevel || !Style.isCpp()) - break; - if (!Previous || Previous->isNot(tok::identifier)) + if (!IsTopLevel || !Style.isCpp() || !Previous || FormatTok->is(tok::eof)) break; - if (Previous->Previous && Previous->Previous->is(tok::at)) - break; - if (!Line->Tokens.begin()->Tok->is(tok::kw_typedef) && - isC78ParameterDecl(FormatTok)) { + const unsigned Position = Tokens->getPosition() + 1; + assert(Position < AllTokens.size()); + if (isC78ParameterDecl(FormatTok, AllTokens[Position], Previous)) { addUnwrappedLine(); return; } break; + } case tok::kw_operator: nextToken(); if (FormatTok->isBinaryOperator()) diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 9975504624fcd..5ff6fdb929405 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3171,6 +3171,8 @@ void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK, Opts.OpenCLVersion = 300; else if (LangStd == LangStandard::lang_openclcpp10) Opts.OpenCLCPlusPlusVersion = 100; + else if (LangStd == LangStandard::lang_openclcpp2021) + Opts.OpenCLCPlusPlusVersion = 202100; // OpenCL has some additional defaults. if (Opts.OpenCL) { @@ -3320,6 +3322,7 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts, case LangStandard::lang_opencl20: case LangStandard::lang_opencl30: case LangStandard::lang_openclcpp10: + case LangStandard::lang_openclcpp2021: StdOpt = OPT_cl_std_EQ; break; default: @@ -3647,6 +3650,7 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, .Cases("cl3.0", "CL3.0", LangStandard::lang_opencl30) .Cases("clc++", "CLC++", LangStandard::lang_openclcpp10) .Cases("clc++1.0", "CLC++1.0", LangStandard::lang_openclcpp10) + .Cases("clc++2021", "CLC++2021", LangStandard::lang_openclcpp2021) .Default(LangStandard::lang_unspecified); if (OpenCLLangStd == LangStandard::lang_unspecified) { diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 84e153ae12304..68f7249e6b916 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -433,11 +433,18 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, // OpenCL v1.0/1.1 s6.9, v1.2/2.0 s6.10: Preprocessor Directives and Macros. if (LangOpts.OpenCL) { if (LangOpts.CPlusPlus) { - if (LangOpts.OpenCLCPlusPlusVersion == 100) + switch (LangOpts.OpenCLCPlusPlusVersion) { + case 100: Builder.defineMacro("__OPENCL_CPP_VERSION__", "100"); - else + break; + case 202100: + Builder.defineMacro("__OPENCL_CPP_VERSION__", "202100"); + break; + default: llvm_unreachable("Unsupported C++ version for OpenCL"); + } Builder.defineMacro("__CL_CPP_VERSION_1_0__", "100"); + Builder.defineMacro("__CL_CPP_VERSION_2021__", "202100"); } else { // OpenCL v1.0 and v1.1 do not have a predefined macro to indicate the // language standard with which the program is compiled. __OPENCL_VERSION__ diff --git a/clang/lib/Frontend/PrintPreprocessedOutput.cpp b/clang/lib/Frontend/PrintPreprocessedOutput.cpp index 1a820ad985a4c..5c5fc751179d1 100644 --- a/clang/lib/Frontend/PrintPreprocessedOutput.cpp +++ b/clang/lib/Frontend/PrintPreprocessedOutput.cpp @@ -182,7 +182,7 @@ class PrintPPOutputPPCallbacks : public PPCallbacks { /// implicitly when at the beginning of the file. /// /// @param Tok Token where to move to. 
- /// @param RequiresStartOfLine Whether the next line depends on being in the + /// @param RequireStartOfLine Whether the next line depends on being in the /// first column, such as a directive. /// /// @return Whether column adjustments are necessary. diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h index 9effaa18d3e8c..ef7e087b832ca 100644 --- a/clang/lib/Headers/__clang_hip_math.h +++ b/clang/lib/Headers/__clang_hip_math.h @@ -19,6 +19,9 @@ #endif #include #include +#ifdef __OPENMP_AMDGCN__ +#include +#endif #endif // !defined(__HIPCC_RTC__) #pragma push_macro("__DEVICE__") @@ -258,6 +261,9 @@ float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } __DEVICE__ float frexpf(float __x, int *__nptr) { int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif float __r = __ocml_frexp_f32(__x, (__attribute__((address_space(5))) int *)&__tmp); *__nptr = __tmp; @@ -343,6 +349,9 @@ long int lroundf(float __x) { return __ocml_round_f32(__x); } __DEVICE__ float modff(float __x, float *__iptr) { float __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif float __r = __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); *__iptr = __tmp; @@ -423,6 +432,9 @@ float remainderf(float __x, float __y) { __DEVICE__ float remquof(float __x, float __y, int *__quo) { int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif float __r = __ocml_remquo_f32( __x, __y, (__attribute__((address_space(5))) int *)&__tmp); *__quo = __tmp; @@ -479,6 +491,9 @@ __RETURN_TYPE __signbitf(float __x) { return __ocml_signbit_f32(__x); } __DEVICE__ void sincosf(float __x, float *__sinptr, float *__cosptr) { float __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif *__sinptr = __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); *__cosptr = __tmp; @@ -487,6 +502,9 @@ void sincosf(float __x, float *__sinptr, float *__cosptr) { __DEVICE__ void sincospif(float __x, float *__sinptr, float *__cosptr) { float __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif *__sinptr = __ocml_sincospi_f32( __x, (__attribute__((address_space(5))) float *)&__tmp); *__cosptr = __tmp; @@ -799,6 +817,9 @@ double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } __DEVICE__ double frexp(double __x, int *__nptr) { int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif double __r = __ocml_frexp_f64(__x, (__attribute__((address_space(5))) int *)&__tmp); *__nptr = __tmp; @@ -883,6 +904,9 @@ long int lround(double __x) { return __ocml_round_f64(__x); } __DEVICE__ double modf(double __x, double *__iptr) { double __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif double __r = __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); *__iptr = __tmp; @@ -971,6 +995,9 @@ double remainder(double __x, double __y) { __DEVICE__ double remquo(double __x, double __y, int *__quo) { int __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif double __r = __ocml_remquo_f64( __x, __y, (__attribute__((address_space(5))) int *)&__tmp); *__quo = __tmp; @@ -1029,6 +1056,9 @@ double sin(double __x) { return __ocml_sin_f64(__x); } __DEVICE__ void sincos(double __x, 
double *__sinptr, double *__cosptr) { double __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif *__sinptr = __ocml_sincos_f64( __x, (__attribute__((address_space(5))) double *)&__tmp); *__cosptr = __tmp; @@ -1037,6 +1067,9 @@ void sincos(double __x, double *__sinptr, double *__cosptr) { __DEVICE__ void sincospi(double __x, double *__sinptr, double *__cosptr) { double __tmp; +#ifdef __OPENMP_AMDGCN__ +#pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) +#endif *__sinptr = __ocml_sincospi_f64( __x, (__attribute__((address_space(5))) double *)&__tmp); *__cosptr = __tmp; diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index ec601a58e7c34..4940666e80836 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -314,8 +314,8 @@ typedef struct __tile1024i_str { /// \param stride /// The stride between the rows' data to be loaded in memory. __DEFAULT_FN_ATTRS_TILE -static void __tile_loadd(__tile1024i *dst, const void *base, - __SIZE_TYPE__ stride) { +static __inline__ void __tile_loadd(__tile1024i *dst, const void *base, + __SIZE_TYPE__ stride) { dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); } @@ -335,8 +335,8 @@ static void __tile_loadd(__tile1024i *dst, const void *base, /// \param stride /// The stride between the rows' data to be loaded in memory. __DEFAULT_FN_ATTRS_TILE -static void __tile_stream_loadd(__tile1024i *dst, const void *base, - __SIZE_TYPE__ stride) { +static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base, + __SIZE_TYPE__ stride) { dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride); } @@ -357,8 +357,8 @@ static void __tile_stream_loadd(__tile1024i *dst, const void *base, /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. __DEFAULT_FN_ATTRS_INT8 -static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { +static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile, src0.tile, src1.tile); } @@ -380,8 +380,8 @@ static void __tile_dpbssd(__tile1024i *dst, __tile1024i src0, /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. __DEFAULT_FN_ATTRS_INT8 -static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { +static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile, src0.tile, src1.tile); } @@ -403,8 +403,8 @@ static void __tile_dpbsud(__tile1024i *dst, __tile1024i src0, /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. __DEFAULT_FN_ATTRS_INT8 -static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { +static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile, src0.tile, src1.tile); } @@ -426,8 +426,8 @@ static void __tile_dpbusd(__tile1024i *dst, __tile1024i src0, /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. 
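Since this hunk touches the whole __tile_* helper API (only to mark the functions __inline__), a minimal usage sketch may be helpful; it is illustrative only and assumes compilation with -mamx-tile -mamx-int8 on an AMX-capable target:

#include <immintrin.h>

/* Illustrative use of the helpers above: load two int8 tiles, accumulate
 * their dot product into an int32 tile, and store it. Row/column counts must
 * respect the AMX limits (at most 16 rows and 64 bytes per row). */
void dpbssd_16x16(const void *A, const void *B, void *C,
                  __SIZE_TYPE__ StrideA, __SIZE_TYPE__ StrideB,
                  __SIZE_TYPE__ StrideC) {
  __tile1024i TA = {16, 64}; /* 16 rows x 64 bytes of int8 */
  __tile1024i TB = {16, 64};
  __tile1024i TC = {16, 64}; /* 16 x 16 int32 accumulators */
  __tile_loadd(&TA, A, StrideA);
  __tile_loadd(&TB, B, StrideB);
  __tile_zero(&TC);
  __tile_dpbssd(&TC, TA, TB);
  __tile_stored(C, StrideC, TC);
}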
__DEFAULT_FN_ATTRS_INT8 -static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { +static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile, src0.tile, src1.tile); } @@ -446,7 +446,8 @@ static void __tile_dpbuud(__tile1024i *dst, __tile1024i src0, /// \param stride /// The stride between the rows' data to be stored in memory. __DEFAULT_FN_ATTRS_TILE -static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) { +static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride, + __tile1024i src) { _tile_stored_internal(src.row, src.col, base, stride, src.tile); } @@ -459,7 +460,7 @@ static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) { /// \param dst /// The destination tile to be zero. Max size is 1024 Bytes. __DEFAULT_FN_ATTRS_TILE -static void __tile_zero(__tile1024i *dst) { +static __inline__ void __tile_zero(__tile1024i *dst) { dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); } @@ -479,8 +480,8 @@ static void __tile_zero(__tile1024i *dst) { /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. __DEFAULT_FN_ATTRS_BF16 -static void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { +static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, src0.tile, src1.tile); } diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 4281a33d375c2..6aee8aed84871 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -178,16 +178,16 @@ _kadd_mask64(__mmask64 __A, __mmask64 __B) } #define _kshiftli_mask32(A, I) \ - (__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I)) + ((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I))) #define _kshiftri_mask32(A, I) \ - (__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I)) + ((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I))) #define _kshiftli_mask64(A, I) \ - (__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I)) + ((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I))) #define _kshiftri_mask64(A, I) \ - (__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I)) + ((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I))) static __inline__ unsigned int __DEFAULT_FN_ATTRS _cvtmask32_u32(__mmask32 __A) { @@ -232,44 +232,44 @@ _store_mask64(__mmask64 *__A, __mmask64 __B) { /* Integer compare */ #define _mm512_cmp_epi8_mask(a, b, p) \ - (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1) + ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)-1)) #define _mm512_mask_cmp_epi8_mask(m, a, b, p) \ - (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m)) + ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)(m))) #define _mm512_cmp_epu8_mask(a, b, p) \ - (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1) + ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), 
(int)(p), \ + (__mmask64)-1)) #define _mm512_mask_cmp_epu8_mask(m, a, b, p) \ - (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m)) + ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)(m))) #define _mm512_cmp_epi16_mask(a, b, p) \ - (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1) + ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)-1)) #define _mm512_mask_cmp_epi16_mask(m, a, b, p) \ - (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m)) + ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)(m))) #define _mm512_cmp_epu16_mask(a, b, p) \ - (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1) + ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)-1)) #define _mm512_mask_cmp_epu16_mask(m, a, b, p) \ - (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m)) + ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)(m))) #define _mm512_cmpeq_epi8_mask(A, B) \ _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) @@ -1428,36 +1428,36 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) #define _mm512_shufflehi_epi16(A, imm) \ - (__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm)) + ((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm))) #define _mm512_mask_shufflehi_epi16(W, U, A, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflehi_epi16((A), \ - (imm)), \ - (__v32hi)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflehi_epi16((A), \ + (imm)), \ + (__v32hi)(__m512i)(W))) #define _mm512_maskz_shufflehi_epi16(U, A, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflehi_epi16((A), \ - (imm)), \ - (__v32hi)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflehi_epi16((A), \ + (imm)), \ + (__v32hi)_mm512_setzero_si512())) #define _mm512_shufflelo_epi16(A, imm) \ - (__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm)) + ((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm))) #define _mm512_mask_shufflelo_epi16(W, U, A, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflelo_epi16((A), \ - (imm)), \ - (__v32hi)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflelo_epi16((A), \ + (imm)), \ + (__v32hi)(__m512i)(W))) #define _mm512_maskz_shufflelo_epi16(U, A, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflelo_epi16((A), \ - (imm)), \ - (__v32hi)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflelo_epi16((A), \ + (imm)), \ + (__v32hi)_mm512_setzero_si512())) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sllv_epi16(__m512i __A, __m512i __B) @@ -1527,7 +1527,7 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) } #define 
_mm512_bslli_epi128(a, imm) \ - (__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)) + ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srlv_epi16(__m512i __A, __m512i __B) @@ -1664,7 +1664,7 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) } #define _mm512_bsrli_epi128(a, imm) \ - (__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)) + ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) @@ -1984,32 +1984,32 @@ _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, } #define _mm512_alignr_epi8(A, B, N) \ - (__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(N)) + ((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), (int)(N))) #define _mm512_mask_alignr_epi8(W, U, A, B, N) \ - (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ - (__v64qi)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ + (__v64qi)(__m512i)(W))) #define _mm512_maskz_alignr_epi8(U, A, B, N) \ - (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ - (__v64qi)(__m512i)_mm512_setzero_si512()) + (__v64qi)(__m512i)_mm512_setzero_si512())) #define _mm512_dbsad_epu8(A, B, imm) \ - (__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), (int)(imm))) #define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ - (__v32hi)(__m512i)(W)) + (__v32hi)(__m512i)(W))) #define _mm512_maskz_dbsad_epu8(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ - (__v32hi)_mm512_setzero_si512()) + (__v32hi)_mm512_setzero_si512())) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sad_epu8 (__m512i __A, __m512i __B) diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index 337256c50f50d..3ba0a0cfd5fdf 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -121,10 +121,10 @@ _kadd_mask16(__mmask16 __A, __mmask16 __B) } #define _kshiftli_mask8(A, I) \ - (__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I)) + ((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I))) #define _kshiftri_mask8(A, I) \ - (__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I)) + ((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I))) static __inline__ unsigned int __DEFAULT_FN_ATTRS _cvtmask8_u32(__mmask8 __A) { @@ -342,19 +342,19 @@ _mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) { } #define _mm512_cvt_roundpd_epi64(A, R) \ - (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ + 
(__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \ - (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtpd_epu64 (__m512d __A) { @@ -381,19 +381,19 @@ _mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) { } #define _mm512_cvt_roundpd_epu64(A, R) \ - (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \ - (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epi64 (__m256 __A) { @@ -420,19 +420,19 @@ _mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) { } #define _mm512_cvt_roundps_epi64(A, R) \ - (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundps_epi64(U, A, R) \ - (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epu64 (__m256 __A) { @@ -459,19 +459,19 @@ _mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) { } #define _mm512_cvt_roundps_epu64(A, R) \ - (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), 
(int)(R))) #define _mm512_maskz_cvt_roundps_epu64(U, A, R) \ - (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -494,19 +494,19 @@ _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) { } #define _mm512_cvt_roundepi64_pd(A, R) \ - (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \ - (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \ - (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_cvtepi64_ps (__m512i __A) { @@ -533,19 +533,19 @@ _mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) { } #define _mm512_cvt_roundepi64_ps(A, R) \ - (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \ - (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R)) + ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)(__m256)(W), (__mmask8)(U), \ + (int)(R))) #define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \ - (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -573,19 +573,19 @@ _mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) { } #define _mm512_cvtt_roundpd_epi64(A, R) \ - (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \ - (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttpd_epu64 (__m512d __A) { @@ -612,19 +612,19 @@ 
_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) { } #define _mm512_cvtt_roundpd_epu64(A, R) \ - (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \ - (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttps_epi64 (__m256 __A) { @@ -651,19 +651,19 @@ _mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) { } #define _mm512_cvtt_roundps_epi64(A, R) \ - (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \ - (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttps_epu64 (__m256 __A) { @@ -690,19 +690,19 @@ _mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) { } #define _mm512_cvtt_roundps_epu64(A, R) \ - (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \ - (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtepu64_pd (__m512i __A) { @@ -724,20 +724,20 @@ _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) { } #define _mm512_cvt_roundepu64_pd(A, R) \ - (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ + 
(__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \ - (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \ - (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256 __DEFAULT_FN_ATTRS512 @@ -765,290 +765,290 @@ _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) { } #define _mm512_cvt_roundepu64_ps(A, R) \ - (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \ - (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R)) + ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)(__m256)(W), (__mmask8)(U), \ + (int)(R))) #define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \ - (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) #define _mm512_range_pd(A, B, C) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_range_pd(W, U, A, B, C) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_range_pd(U, A, B, C) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_range_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_range_round_pd(W, U, A, B, C, R) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R)) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), 
(int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), \ + (int)(R))) #define _mm512_maskz_range_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm512_range_ps(A, B, C) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_range_ps(W, U, A, B, C) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_range_ps(U, A, B, C) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_range_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_range_round_ps(W, U, A, B, C, R) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R)) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U), \ + (int)(R))) #define _mm512_maskz_range_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) #define _mm_range_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8) -1, (int)(C),\ - (int)(R)) + ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8) -1, (int)(C),\ + (int)(R))) #define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm_mask_range_round_ss(W, U, A, B, C, R) \ - (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W),\ - (__mmask8)(U), (int)(C),\ - (int)(R)) + ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W),\ 
+ (__mmask8)(U), (int)(C),\ + (int)(R))) #define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION) #define _mm_maskz_range_round_ss(U, A, B, C, R) \ - (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(C),\ - (int)(R)) + ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(C),\ + (int)(R))) #define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm_range_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8) -1, (int)(C),\ - (int)(R)) + ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8) -1, (int)(C),\ + (int)(R))) #define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm_mask_range_round_sd(W, U, A, B, C, R) \ - (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W),\ - (__mmask8)(U), (int)(C),\ - (int)(R)) + ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W),\ + (__mmask8)(U), (int)(C),\ + (int)(R))) #define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm_maskz_range_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(C),\ - (int)(R)) + ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(C),\ + (int)(R))) #define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) #define _mm512_reduce_pd(A, B) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_reduce_pd(W, U, A, B) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_reduce_pd(U, A, B) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_reduce_ps(A, B) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_reduce_ps(W, U, A, B) \ - 
(__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_reduce_ps(U, A, B) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_reduce_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_reduce_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_reduce_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm512_reduce_round_ps(A, B, R) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_reduce_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_reduce_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) #define _mm_reduce_ss(A, B, C) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ - (int)(C), _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ + (int)(C), _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_reduce_ss(W, U, A, B, C) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(C), _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(C), _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_reduce_ss(U, A, B, C) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(C), \ - _MM_FROUND_CUR_DIRECTION) + 
((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_reduce_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ - (int)(C), (int)(R)) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ + (int)(C), (int)(R))) #define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(C), (int)(R)) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(C), (int)(R))) #define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ - (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(C), (int)(R)) + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(C), (int)(R))) #define _mm_reduce_sd(A, B, C) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(C), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_reduce_sd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ - (int)(C), _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), (__mmask8)(U), \ + (int)(C), _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_reduce_sd(U, A, B, C) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(C), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_reduce_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(C), (int)(R)) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(C), (int)(R))) #define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ - (int)(C), (int)(R)) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), (__mmask8)(U), \ + (int)(C), (int)(R))) #define _mm_maskz_reduce_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(C), (int)(R)) + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(C), (int)(R))) static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 
_mm512_movepi32_mask (__m512i __A) @@ -1218,158 +1218,158 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) } #define _mm512_extractf32x8_ps(A, imm) \ - (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)_mm256_undefined_ps(), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)_mm256_undefined_ps(), \ + (__mmask8)-1)) #define _mm512_mask_extractf32x8_ps(W, U, A, imm) \ - (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extractf32x8_ps(U, A, imm) \ - (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) #define _mm512_extractf64x2_pd(A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1)) #define _mm512_mask_extractf64x2_pd(W, U, A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extractf64x2_pd(U, A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm512_extracti32x8_epi32(A, imm) \ - (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)_mm256_undefined_si256(), \ - (__mmask8)-1) + ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)_mm256_undefined_si256(), \ + (__mmask8)-1)) #define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \ - (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U)) + ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extracti32x8_epi32(U, A, imm) \ - (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)) + ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U))) #define _mm512_extracti64x2_epi64(A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ (int)(imm), \ (__v2di)_mm_undefined_si128(), \ - (__mmask8)-1) + (__mmask8)-1)) #define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + (int)(imm), \ + (__v2di)(__m128i)(W), \ + (__mmask8)(U))) #define 
_mm512_maskz_extracti64x2_epi64(U, A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + (int)(imm), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm512_insertf32x8(A, B, imm) \ - (__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \ - (__v8sf)(__m256)(B), (int)(imm)) + ((__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \ + (__v8sf)(__m256)(B), (int)(imm))) #define _mm512_mask_insertf32x8(W, U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_insertf32x8(U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) #define _mm512_insertf64x2(A, B, imm) \ - (__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \ - (__v2df)(__m128d)(B), (int)(imm)) + ((__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \ + (__v2df)(__m128d)(B), (int)(imm))) #define _mm512_mask_insertf64x2(W, U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_insertf64x2(U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) #define _mm512_inserti32x8(A, B, imm) \ - (__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \ - (__v8si)(__m256i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \ + (__v8si)(__m256i)(B), (int)(imm))) #define _mm512_mask_inserti32x8(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ - (__v16si)(__m512i)(W)) + (__v16si)(__m512i)(W))) #define _mm512_maskz_inserti32x8(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()) + (__v16si)_mm512_setzero_si512())) #define _mm512_inserti64x2(A, B, imm) \ - (__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \ - (__v2di)(__m128i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \ + (__v2di)(__m128i)(B), (int)(imm))) #define _mm512_mask_inserti64x2(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ - (__v8di)(__m512i)(W)) + (__v8di)(__m512i)(W))) #define _mm512_maskz_inserti64x2(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()) + (__v8di)_mm512_setzero_si512())) #define _mm512_mask_fpclass_ps_mask(U, A, imm) \ - (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ 
- (int)(imm), (__mmask16)(U)) + ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ + (int)(imm), (__mmask16)(U))) #define _mm512_fpclass_ps_mask(A, imm) \ - (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ - (int)(imm), (__mmask16)-1) + ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ + (int)(imm), (__mmask16)-1)) #define _mm512_mask_fpclass_pd_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm512_fpclass_pd_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm_fpclass_sd_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_fpclass_sd_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm_fpclass_ss_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_fpclass_ss_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)(U))) #undef __DEFAULT_FN_ATTRS512 #undef __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512erintrin.h b/clang/lib/Headers/avx512erintrin.h index 8570061699068..1c5a2d2d208ff 100644 --- a/clang/lib/Headers/avx512erintrin.h +++ b/clang/lib/Headers/avx512erintrin.h @@ -15,19 +15,19 @@ /* exp2a23 */ #define _mm512_exp2a23_round_pd(A, R) \ - (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_exp2a23_round_pd(S, M, A, R) \ - (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R)) + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) #define _mm512_maskz_exp2a23_round_pd(M, A, R) \ - (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R)) + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) #define _mm512_exp2a23_pd(A) \ _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION) @@ -39,19 +39,19 @@ _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) #define _mm512_exp2a23_round_ps(A, R) \ - (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_exp2a23_round_ps(S, M, A, R) \ - (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(S), (__mmask16)(M), 
\ - (int)(R)) + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) #define _mm512_maskz_exp2a23_round_ps(M, A, R) \ - (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R)) + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) #define _mm512_exp2a23_ps(A) \ _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION) @@ -64,19 +64,19 @@ /* rsqrt28 */ #define _mm512_rsqrt28_round_pd(A, R) \ - (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \ - (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R)) + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) #define _mm512_maskz_rsqrt28_round_pd(M, A, R) \ - (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R)) + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) #define _mm512_rsqrt28_pd(A) \ _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION) @@ -88,19 +88,19 @@ _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) #define _mm512_rsqrt28_round_ps(A, R) \ - (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \ - (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R)) + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) #define _mm512_maskz_rsqrt28_round_ps(M, A, R) \ - (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R)) + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) #define _mm512_rsqrt28_ps(A) \ _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION) @@ -112,22 +112,22 @@ _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) #define _mm_rsqrt28_round_ss(A, B, R) \ - (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \ - (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(S), \ - (__mmask8)(M), (int)(R)) + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(S), \ + (__mmask8)(M), (int)(R))) #define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \ - (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(M), (int)(R)) + 
((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(M), (int)(R))) #define _mm_rsqrt28_ss(A, B) \ _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) @@ -139,22 +139,22 @@ _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) #define _mm_rsqrt28_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \ - (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(S), \ - (__mmask8)(M), (int)(R)) + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(S), \ + (__mmask8)(M), (int)(R))) #define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \ - (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(M), (int)(R)) + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(M), (int)(R))) #define _mm_rsqrt28_sd(A, B) \ _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) @@ -167,19 +167,19 @@ /* rcp28 */ #define _mm512_rcp28_round_pd(A, R) \ - (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_rcp28_round_pd(S, M, A, R) \ - (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R)) + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) #define _mm512_maskz_rcp28_round_pd(M, A, R) \ - (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R)) + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) #define _mm512_rcp28_pd(A) \ _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION) @@ -191,19 +191,19 @@ _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) #define _mm512_rcp28_round_ps(A, R) \ - (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_rcp28_round_ps(S, M, A, R) \ - (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R)) + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) #define _mm512_maskz_rcp28_round_ps(M, A, R) \ - (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R)) + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) #define _mm512_rcp28_ps(A) \ _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION) @@ -215,22 +215,22 @@ _mm512_maskz_rcp28_round_ps((M), (A), 
_MM_FROUND_CUR_DIRECTION) #define _mm_rcp28_round_ss(A, B, R) \ - (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_rcp28_round_ss(S, M, A, B, R) \ - (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(S), \ - (__mmask8)(M), (int)(R)) + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(S), \ + (__mmask8)(M), (int)(R))) #define _mm_maskz_rcp28_round_ss(M, A, B, R) \ - (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(M), (int)(R)) + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(M), (int)(R))) #define _mm_rcp28_ss(A, B) \ _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) @@ -242,22 +242,22 @@ _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) #define _mm_rcp28_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_rcp28_round_sd(S, M, A, B, R) \ - (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(S), \ - (__mmask8)(M), (int)(R)) + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(S), \ + (__mmask8)(M), (int)(R))) #define _mm_maskz_rcp28_round_sd(M, A, B, R) \ - (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(M), (int)(R)) + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(M), (int)(R))) #define _mm_rcp28_sd(A, B) \ _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 010bcadab0195..df298640523b7 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -937,18 +937,18 @@ _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) } #define _mm512_max_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_max_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_max_round_pd((A), (B), (R)), \ - (__v8df)(W)) + (__v8df)(W))) #define _mm512_maskz_max_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_max_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_max_pd(__m512d __A, __m512d __B) @@ -974,18 +974,18 @@ _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) } 
#define _mm512_max_round_ps(A, B, R) \ - (__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_max_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ - (__v16sf)(W)) + (__v16sf)(W))) #define _mm512_maskz_max_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_max_ps(__m512 __A, __m512 __B) @@ -1029,22 +1029,22 @@ _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) { } #define _mm_max_round_ss(A, B, R) \ - (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_max_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_max_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -1065,22 +1065,22 @@ _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) { } #define _mm_max_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_max_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_max_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline __m512i __DEFAULT_FN_ATTRS512 @@ -1172,18 +1172,18 @@ _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) } #define _mm512_min_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_min_round_pd(W, U, A, B, R) \ - 
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_min_round_pd((A), (B), (R)), \ - (__v8df)(W)) + (__v8df)(W))) #define _mm512_maskz_min_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_min_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_min_pd(__m512d __A, __m512d __B) @@ -1209,18 +1209,18 @@ _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) } #define _mm512_min_round_ps(A, B, R) \ - (__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_min_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ - (__v16sf)(W)) + (__v16sf)(W))) #define _mm512_maskz_min_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_min_ps(__m512 __A, __m512 __B) @@ -1264,22 +1264,22 @@ _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) { } #define _mm_min_round_ss(A, B, R) \ - (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_min_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_min_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -1300,22 +1300,22 @@ _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) { } #define _mm_min_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_min_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_min_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ - 
(__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline __m512i __DEFAULT_FN_ATTRS512 @@ -1485,17 +1485,17 @@ _mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { } #define _mm512_sqrt_round_pd(A, R) \ - (__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)) + ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R))) #define _mm512_mask_sqrt_round_pd(W, U, A, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sqrt_round_pd((A), (R)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_sqrt_round_pd(U, A, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sqrt_round_pd((A), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_sqrt_pd(__m512d __A) @@ -1521,17 +1521,17 @@ _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) } #define _mm512_sqrt_round_ps(A, R) \ - (__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)) + ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R))) #define _mm512_mask_sqrt_round_ps(W, U, A, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_sqrt_round_ps(U, A, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_sqrt_ps(__m512 __A) @@ -1900,22 +1900,22 @@ _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { } #define _mm_add_round_ss(A, B, R) \ - (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_add_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_add_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -1929,22 +1929,22 @@ _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } #define _mm_add_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + 
((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_add_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_add_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { @@ -1975,32 +1975,32 @@ _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { } #define _mm512_add_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_add_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_add_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_add_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_add_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) #define _mm512_add_round_ps(A, B, R) \ - (__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_add_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_add_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2014,22 +2014,22 @@ _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } #define _mm_sub_round_ss(A, B, R) \ - (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_sub_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_sub_round_ss(U, A, B, R) \ - 
(__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -2044,22 +2044,22 @@ _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { } #define _mm_sub_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_sub_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_sub_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { @@ -2090,32 +2090,32 @@ _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { } #define _mm512_sub_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_sub_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_sub_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) #define _mm512_sub_round_ps(A, B, R) \ - (__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_sub_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_sub_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2129,22 +2129,22 @@ _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } #define _mm_mul_round_ss(A, B, R) \ - 
(__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_mul_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_mul_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -2159,22 +2159,22 @@ _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { } #define _mm_mul_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_mul_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_mul_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { @@ -2205,32 +2205,32 @@ _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { } #define _mm512_mul_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_mul_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_mul_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) #define _mm512_mul_round_ps(A, B, R) \ - (__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_mul_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ 
(__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_mul_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { @@ -2245,22 +2245,22 @@ _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { } #define _mm_div_round_ss(A, B, R) \ - (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_div_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) #define _mm_maskz_div_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { @@ -2275,22 +2275,22 @@ _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { } #define _mm_div_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_div_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_div_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_div_pd(__m512d __a, __m512d __b) @@ -2333,179 +2333,179 @@ _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { } #define _mm512_div_round_pd(A, B, R) \ - (__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R)) + ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) #define _mm512_mask_div_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_div_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W)) + (__v8df)(__m512d)(W))) #define _mm512_maskz_div_round_pd(U, A, B, R) \ - 
(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_div_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd()) + (__v8df)_mm512_setzero_pd())) #define _mm512_div_round_ps(A, B, R) \ - (__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R)) + ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) #define _mm512_mask_div_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W)) + (__v16sf)(__m512)(W))) #define _mm512_maskz_div_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps()) + (__v16sf)_mm512_setzero_ps())) #define _mm512_roundscale_ps(A, B) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_roundscale_ps(A, B, C, imm) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ (__v16sf)(__m512)(A), (__mmask16)(B), \ - _MM_FROUND_CUR_DIRECTION) + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_roundscale_ps(A, B, imm) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(A), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(A), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ (__v16sf)(__m512)(A), (__mmask16)(B), \ - (int)(R)) + (int)(R))) #define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(A), (int)(R)) + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(A), (int)(R))) #define _mm512_roundscale_round_ps(A, imm, R) \ - (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_roundscale_pd(A, B) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_roundscale_pd(A, B, C, imm) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ (__v8df)(__m512d)(A), (__mmask8)(B), \ - 
_MM_FROUND_CUR_DIRECTION) + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_roundscale_pd(A, B, imm) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(A), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(A), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ (__v8df)(__m512d)(A), (__mmask8)(B), \ - (int)(R)) + (int)(R))) #define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(A), (int)(R)) + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(A), (int)(R))) #define _mm512_roundscale_round_pd(A, imm, R) \ - (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_fmadd_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_fmsub_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ - 
(__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_fnmadd_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_fnmsub_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -2629,87 +2629,87 @@ _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) } #define _mm512_fmadd_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_fmsub_round_ps(A, B, C, R) \ - 
(__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_fnmadd_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_fnmsub_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -2833,52 +2833,52 @@ _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) } #define _mm512_fmaddsub_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + 
(__mmask8)(U), (int)(R))) #define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_fmsubadd_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -2952,52 +2952,52 @@ _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) } #define _mm512_fmaddsub_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_fmsubadd_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - 
(__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -3071,10 +3071,10 @@ _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) } #define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -3088,10 +3088,10 @@ _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) } #define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) @@ -3104,10 +3104,10 @@ _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) } #define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -3121,10 +3121,10 @@ _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) } #define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -3138,10 +3138,10 @@ _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) } #define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - -(__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -3155,10 +3155,10 @@ 
_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) } #define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -3172,17 +3172,17 @@ _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) } #define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ - (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - -(__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) #define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ - (__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 @@ -3206,17 +3206,17 @@ _mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) } #define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ - (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) #define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ - (__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 @@ -3312,63 +3312,63 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, } #define _mm512_alignr_epi64(A, B, I) \ - (__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I))) #define _mm512_mask_alignr_epi64(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_alignr_epi64(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512())) #define _mm512_alignr_epi32(A, B, I) \ - (__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I))) #define _mm512_mask_alignr_epi32(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ - 
(__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_alignr_epi32(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512())) /* Vector Extract */ #define _mm512_extractf64x4_pd(A, I) \ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ - (__v4df)_mm256_undefined_pd(), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ + (__v4df)_mm256_undefined_pd(), \ + (__mmask8)-1)) #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extractf64x4_pd(U, A, imm) \ - (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) #define _mm512_extractf32x4_ps(A, I) \ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1)) #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extractf32x4_ps(U, A, imm) \ - (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) /* Vector Blend */ @@ -3407,14 +3407,14 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) /* Compare */ #define _mm512_cmp_round_ps_mask(A, B, P, R) \ - (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(P), \ - (__mmask16)-1, (int)(R)) + ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(P), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \ - (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(P), \ - (__mmask16)(U), (int)(R)) + ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(P), \ + (__mmask16)(U), (int)(R))) #define _mm512_cmp_ps_mask(A, B, P) \ _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) @@ -3462,14 +3462,14 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) #define _mm512_cmp_round_pd_mask(A, B, P, R) \ - (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(P), \ - (__mmask8)-1, (int)(R)) + ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ + 
(__v8df)(__m512d)(B), (int)(P), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \ - (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(P), \ - (__mmask8)(U), (int)(R)) + ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(P), \ + (__mmask8)(U), (int)(R))) #define _mm512_cmp_pd_mask(A, B, P) \ _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) @@ -3519,19 +3519,19 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) /* Conversion */ #define _mm512_cvtt_roundps_epu32(A, R) \ - (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_undefined_epi32(), \ - (__mmask16)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_undefined_epi32(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \ - (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) static __inline __m512i __DEFAULT_FN_ATTRS512 @@ -3563,34 +3563,34 @@ _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) } #define _mm512_cvt_roundepi32_ps(A, R) \ - (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \ - (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \ - (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) #define _mm512_cvt_roundepu32_ps(A, R) \ - (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \ - (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \ - (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtepu32_ps (__m512i __A) @@ 
-3705,19 +3705,19 @@ _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) } #define _mm512_cvt_roundpd_ps(A, R) \ - (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \ - (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ - (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R)) + ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ + (__v8sf)(__m256)(W), (__mmask8)(U), \ + (int)(R))) #define _mm512_maskz_cvt_roundpd_ps(U, A, R) \ - (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ps (__m512d __A) @@ -3765,38 +3765,38 @@ _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) } #define _mm512_cvt_roundps_ph(A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)_mm256_undefined_si256(), \ - (__mmask16)-1) + ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v16hi)_mm256_undefined_si256(), \ + (__mmask16)-1)) #define _mm512_mask_cvt_roundps_ph(U, W, A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)(__m256i)(U), \ - (__mmask16)(W)) + ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v16hi)(__m256i)(U), \ + (__mmask16)(W))) #define _mm512_maskz_cvt_roundps_ph(W, A, I) \ - (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(W)) + ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v16hi)_mm256_setzero_si256(), \ + (__mmask16)(W))) #define _mm512_cvtps_ph _mm512_cvt_roundps_ph #define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph #define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph #define _mm512_cvt_roundph_ps(A, R) \ - (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvt_roundph_ps(W, U, A, R) \ - (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvt_roundph_ps(U, A, R) \ - (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) static __inline __m512 __DEFAULT_FN_ATTRS512 @@ -3828,19 +3828,19 @@ _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) } #define _mm512_cvtt_roundpd_epi32(A, R) \ - (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (int)(R))) #define 
_mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \ - (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \ - (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) static __inline __m256i __DEFAULT_FN_ATTRS512 _mm512_cvttpd_epi32(__m512d __a) @@ -3870,19 +3870,19 @@ _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) } #define _mm512_cvtt_roundps_epi32(A, R) \ - (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \ - (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \ - (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttps_epi32(__m512 __a) @@ -3912,19 +3912,19 @@ _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) } #define _mm512_cvt_roundps_epi32(A, R) \ - (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvt_roundps_epi32(U, A, R) \ - (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epi32 (__m512 __A) @@ -3955,19 +3955,19 @@ _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) } #define _mm512_cvt_roundpd_epi32(A, R) \ - (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \ - (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) #define 
_mm512_maskz_cvt_roundpd_epi32(U, A, R) \ - (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtpd_epi32 (__m512d __A) @@ -3999,19 +3999,19 @@ _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) } #define _mm512_cvt_roundps_epu32(A, R) \ - (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R)) + ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \ - (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_cvt_roundps_epu32(U, A, R) \ - (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)) + ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epu32 ( __m512 __A) @@ -4043,19 +4043,19 @@ _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A) } #define _mm512_cvt_roundpd_epu32(A, R) \ - (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \ - (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \ - (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtpd_epu32 (__m512d __A) @@ -4975,70 +4975,70 @@ _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) #define _mm512_cmp_epi32_mask(a, b, p) \ - (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm512_cmp_epu32_mask(a, b, p) \ - (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm512_cmp_epi64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm512_cmp_epu64_mask(a, b, p) \ - 
(__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm512_mask_cmp_epi32_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm512_mask_cmp_epu32_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm512_mask_cmp_epi64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm512_mask_cmp_epu64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm512_rol_epi32(a, b) \ - (__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)) + ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b))) #define _mm512_mask_rol_epi32(W, U, a, b) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_rol_epi32((a), (b)), \ - (__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_rol_epi32((a), (b)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_rol_epi32(U, a, b) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_rol_epi32((a), (b)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_rol_epi32((a), (b)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_rol_epi64(a, b) \ - (__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)) + ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b))) #define _mm512_mask_rol_epi64(W, U, a, b) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_rol_epi64((a), (b)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_rol_epi64((a), (b)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_rol_epi64(U, a, b) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_rol_epi64((a), (b)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_rol_epi64((a), (b)), \ + (__v8di)_mm512_setzero_si512())) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_rolv_epi32 (__m512i __A, __m512i __B) @@ -5085,30 +5085,30 @@ _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) } #define _mm512_ror_epi32(A, B) \ - (__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)) + ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B))) #define _mm512_mask_ror_epi32(W, U, A, B) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_ror_epi32((A), (B)), \ - (__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_ror_epi32((A), (B)), \ + 
(__v16si)(__m512i)(W))) #define _mm512_maskz_ror_epi32(U, A, B) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_ror_epi32((A), (B)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_ror_epi32((A), (B)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_ror_epi64(A, B) \ - (__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)) + ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B))) #define _mm512_mask_ror_epi64(W, U, A, B) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_ror_epi64((A), (B)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_ror_epi64((A), (B)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_ror_epi64(U, A, B) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_ror_epi64((A), (B)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_ror_epi64((A), (B)), \ + (__v8di)_mm512_setzero_si512())) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_slli_epi32(__m512i __A, unsigned int __B) @@ -5304,168 +5304,168 @@ _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) } #define _mm512_fixupimm_round_pd(A, B, C, imm, R) \ - (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \ - (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) #define _mm512_fixupimm_pd(A, B, C, imm) \ - (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \ - (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \ - (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), \ - (int)(imm), (__mmask8)(U), \ - (int)(R)) + ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), \ + (int)(imm), (__mmask8)(U), \ + (int)(R))) #define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \ - (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), \ - (int)(imm), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), 
\ + (__v8di)(__m512i)(C), \ + (int)(imm), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_fixupimm_round_ps(A, B, C, imm, R) \ - (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \ - (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)(U), (int)(R))) #define _mm512_fixupimm_ps(A, B, C, imm) \ - (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \ - (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \ - (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), \ - (int)(imm), (__mmask16)(U), \ - (int)(R)) + ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), \ + (int)(imm), (__mmask16)(U), \ + (int)(R))) #define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \ - (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), \ - (int)(imm), (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), \ + (int)(imm), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_fixupimm_round_sd(A, B, C, imm, R) \ - (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \ - (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) #define _mm_fixupimm_sd(A, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - 
(__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \ - (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)) + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) -#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ +#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_fixupimm_round_ss(A, B, C, imm, R) \ - (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \ - (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) #define _mm_fixupimm_ss(A, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \ - (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)) + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) -#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ +#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \ + ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) 
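[Editorial note, not part of the patch] Every hunk in this header follows the same mechanical pattern: each AVX-512 intrinsic macro body, which previously began with a bare cast of a __builtin_ia32_* call, is now wrapped in one extra pair of parentheses. A minimal, self-contained sketch of the kind of parsing hazard this appears to close off is below; the v2d, splat2 and SPLAT_* names are hypothetical stand-ins for the intrinsic macros, not identifiers from the patch.

#include <stddef.h>

/* Toy vector type and helper standing in for the __m128d / __builtin_ia32_*
 * pair used by the real macros. */
typedef double v2d __attribute__((vector_size(16)));

static v2d splat2(double x) {
  v2d r = {x, x};
  return r;
}

/* Pre-patch shape: the expansion starts with a bare cast. */
#define SPLAT_OLD(X) (v2d)splat2((double)(X))
/* Post-patch shape: the whole expansion is a single parenthesized expression. */
#define SPLAT_NEW(X) ((v2d)splat2((double)(X)))

size_t demo(void) {
  /* "sizeof SPLAT_OLD(1.0)" expands to "sizeof (v2d) splat2(...)", which the
   * parser reads as sizeof(v2d) followed by a stray expression - a syntax
   * error.  The parenthesized form keeps the expansion one operand. */
  return sizeof SPLAT_NEW(1.0);
}

Similar hazards can arise wherever a macro expansion is immediately followed by a postfix operator (for example clang's vector subscripting), since postfix operators bind tighter than the leading cast; wrapping the body makes the macro usable in any expression context, which is presumably the motivation for the change seen throughout these hunks.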
#define _mm_getexp_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 @@ -5486,10 +5486,10 @@ _mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_mask_getexp_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) @@ -5502,16 +5502,16 @@ _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) } #define _mm_maskz_getexp_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm_getexp_round_ss(A, B, R) \ - (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_getexp_ss (__m128 __A, __m128 __B) @@ -5531,10 +5531,10 @@ _mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_mask_getexp_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) @@ -5547,100 +5547,100 @@ _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) } #define _mm_maskz_getexp_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) #define _mm_getmant_round_sd(A, B, C, D, R) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_getmant_sd(A, B, C, D) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + 
(__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_getmant_sd(W, U, A, B, C, D) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_getmant_sd(U, A, B, C, D) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \ - (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) -#define _mm_getmant_round_ss(A, B, C, D, R) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) +#define _mm_getmant_round_ss(A, B, C, D, R) \ + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_getmant_ss(A, B, C, D) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_getmant_ss(W, U, A, B, C, D) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)(__m128)(W), \ + 
(__mmask8)(U), (int)(R))) #define _mm_maskz_getmant_ss(U, A, B, C, D) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \ - (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kmov (__mmask16 __A) @@ -5649,16 +5649,16 @@ _mm512_kmov (__mmask16 __A) } #define _mm_comi_round_sd(A, B, P, R) \ - (int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \ - (int)(P), (int)(R)) + ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \ + (int)(P), (int)(R))) #define _mm_comi_round_ss(A, B, P, R) \ - (int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ - (int)(P), (int)(R)) + ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ + (int)(P), (int)(R))) #ifdef __x86_64__ #define _mm_cvt_roundsd_si64(A, R) \ - (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)) + ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) #endif static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -5926,54 +5926,54 @@ _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) } #define _mm512_ternarylogic_epi32(A, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1) + ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)-1)) #define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U)) + ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)(U))) #define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C), \ - (int)(imm), (__mmask16)(U)) + ((__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C), \ + (int)(imm), (__mmask16)(U))) #define _mm512_ternarylogic_epi64(A, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \ - (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \ - 
(__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \ + ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ (__v8di)(__m512i)(B), \ (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \ + ((__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)(U))) #ifdef __x86_64__ #define _mm_cvt_roundsd_i64(A, R) \ - (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)) + ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) #endif #define _mm_cvt_roundsd_si32(A, R) \ - (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)) + ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))) #define _mm_cvt_roundsd_i32(A, R) \ - (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)) + ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))) #define _mm_cvt_roundsd_u32(A, R) \ - (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)) + ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R))) static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvtsd_u32 (__m128d __A) @@ -5984,8 +5984,8 @@ _mm_cvtsd_u32 (__m128d __A) #ifdef __x86_64__ #define _mm_cvt_roundsd_u64(A, R) \ - (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \ - (int)(R)) + ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \ + (int)(R))) static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvtsd_u64 (__m128d __A) @@ -5997,21 +5997,21 @@ _mm_cvtsd_u64 (__m128d __A) #endif #define _mm_cvt_roundss_si32(A, R) \ - (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)) + ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))) #define _mm_cvt_roundss_i32(A, R) \ - (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)) + ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))) #ifdef __x86_64__ #define _mm_cvt_roundss_si64(A, R) \ - (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)) + ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))) #define _mm_cvt_roundss_i64(A, R) \ - (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)) + ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))) #endif #define _mm_cvt_roundss_u32(A, R) \ - (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)) + ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R))) static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvtss_u32 (__m128 __A) @@ -6022,8 +6022,8 @@ _mm_cvtss_u32 (__m128 __A) #ifdef __x86_64__ #define _mm_cvt_roundss_u64(A, R) \ - (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \ - (int)(R)) + ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \ + (int)(R))) static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvtss_u64 (__m128 __A) @@ -6035,10 +6035,10 @@ _mm_cvtss_u64 (__m128 __A) #endif #define _mm_cvtt_roundsd_i32(A, R) \ - (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)) + ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R))) #define _mm_cvtt_roundsd_si32(A, R) \ - (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)) + ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R))) static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsd_i32 (__m128d __A) @@ -6049,10 +6049,10 @@ _mm_cvttsd_i32 
(__m128d __A) #ifdef __x86_64__ #define _mm_cvtt_roundsd_si64(A, R) \ - (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)) + ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R))) #define _mm_cvtt_roundsd_i64(A, R) \ - (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)) + ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R))) static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsd_i64 (__m128d __A) @@ -6063,7 +6063,7 @@ _mm_cvttsd_i64 (__m128d __A) #endif #define _mm_cvtt_roundsd_u32(A, R) \ - (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)) + ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R))) static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvttsd_u32 (__m128d __A) @@ -6074,8 +6074,8 @@ _mm_cvttsd_u32 (__m128d __A) #ifdef __x86_64__ #define _mm_cvtt_roundsd_u64(A, R) \ - (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ - (int)(R)) + ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ + (int)(R))) static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvttsd_u64 (__m128d __A) @@ -6087,10 +6087,10 @@ _mm_cvttsd_u64 (__m128d __A) #endif #define _mm_cvtt_roundss_i32(A, R) \ - (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)) + ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R))) #define _mm_cvtt_roundss_si32(A, R) \ - (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)) + ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R))) static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttss_i32 (__m128 __A) @@ -6101,10 +6101,10 @@ _mm_cvttss_i32 (__m128 __A) #ifdef __x86_64__ #define _mm_cvtt_roundss_i64(A, R) \ - (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)) + ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R))) #define _mm_cvtt_roundss_si64(A, R) \ - (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)) + ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R))) static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttss_i64 (__m128 __A) @@ -6115,7 +6115,7 @@ _mm_cvttss_i64 (__m128 __A) #endif #define _mm_cvtt_roundss_u32(A, R) \ - (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)) + ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R))) static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvttss_u32 (__m128 __A) @@ -6126,8 +6126,8 @@ _mm_cvttss_u32 (__m128 __A) #ifdef __x86_64__ #define _mm_cvtt_roundss_u64(A, R) \ - (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ - (int)(R)) + ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ + (int)(R))) static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvttss_u64 (__m128 __A) @@ -6139,30 +6139,30 @@ _mm_cvttss_u64 (__m128 __A) #endif #define _mm512_permute_pd(X, C) \ - (__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)) + ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C))) #define _mm512_mask_permute_pd(W, U, X, C) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permute_pd((X), (C)), \ - (__v8df)(__m512d)(W)) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permute_pd((X), (C)), \ + (__v8df)(__m512d)(W))) #define _mm512_maskz_permute_pd(U, X, C) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permute_pd((X), (C)), \ - 
(__v8df)_mm512_setzero_pd()) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permute_pd((X), (C)), \ + (__v8df)_mm512_setzero_pd())) #define _mm512_permute_ps(X, C) \ - (__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)) + ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C))) #define _mm512_mask_permute_ps(W, U, X, C) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_permute_ps((X), (C)), \ - (__v16sf)(__m512)(W)) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_permute_ps((X), (C)), \ + (__v16sf)(__m512)(W))) #define _mm512_maskz_permute_ps(U, X, C) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_permute_ps((X), (C)), \ - (__v16sf)_mm512_setzero_ps()) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_permute_ps((X), (C)), \ + (__v16sf)_mm512_setzero_ps())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_permutevar_pd(__m512d __A, __m512i __C) @@ -6274,19 +6274,19 @@ _mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) #define _mm512_cvtt_roundpd_epu32(A, R) \ - (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_undefined_si256(), \ - (__mmask8)-1, (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_undefined_si256(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \ - (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \ - (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)) + ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvttpd_epu32 (__m512d __A) @@ -6318,106 +6318,106 @@ _mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) } #define _mm_roundscale_round_sd(A, B, imm, R) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(imm), \ - (int)(R)) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(imm), \ + (int)(R))) #define _mm_roundscale_sd(A, B, imm) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(imm), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(imm), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_roundscale_sd(W, U, A, B, imm) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(imm), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(imm), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \ - 
(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(I), \ - (int)(R)) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) #define _mm_maskz_roundscale_sd(U, A, B, I) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(I), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \ - (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(I), \ - (int)(R)) + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) #define _mm_roundscale_round_ss(A, B, imm, R) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(imm), \ - (int)(R)) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(imm), \ + (int)(R))) #define _mm_roundscale_ss(A, B, imm) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(imm), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(imm), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_roundscale_ss(W, U, A, B, I) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(I), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(I), \ - (int)(R)) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) #define _mm_maskz_roundscale_ss(U, A, B, I) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(I), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \ - (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(I), \ - (int)(R)) + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) #define _mm512_scalef_round_pd(A, 
B, R) \ - (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_scalef_round_pd(W, U, A, B, R) \ - (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_scalef_round_pd(U, A, B, R) \ - (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_scalef_pd (__m512d __A, __m512d __B) @@ -6452,22 +6452,22 @@ _mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) } #define _mm512_scalef_round_ps(A, B, R) \ - (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_scalef_round_ps(W, U, A, B, R) \ - (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_scalef_round_ps(U, A, B, R) \ - (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_scalef_ps (__m512 __A, __m512 __B) @@ -6502,10 +6502,10 @@ _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) } #define _mm_scalef_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_scalef_sd (__m128d __A, __m128d __B) @@ -6527,10 +6527,10 @@ _mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_mask_scalef_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B) @@ -6543,16 +6543,16 @@ _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B) } #define 
_mm_maskz_scalef_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm_scalef_round_ss(A, B, R) \ - (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_scalef_ss (__m128 __A, __m128 __B) @@ -6574,10 +6574,10 @@ _mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_mask_scalef_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) @@ -6590,11 +6590,11 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) } #define _mm_maskz_scalef_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), \ + (int)(R))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srai_epi32(__m512i __A, unsigned int __B) @@ -6642,94 +6642,94 @@ _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B) } #define _mm512_shuffle_f32x4(A, B, imm) \ - (__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(imm)) + ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(imm))) #define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ - (__v16sf)(__m512)(W)) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ + (__v16sf)(__m512)(W))) #define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps()) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps())) #define _mm512_shuffle_f64x2(A, B, imm) \ - (__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(imm)) + ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(imm))) #define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ - (__v8df)(__m512d)(W)) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ + (__v8df)(__m512d)(W))) #define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ - 
(__v8df)_mm512_setzero_pd()) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd())) #define _mm512_shuffle_i32x4(A, B, imm) \ - (__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(imm))) #define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ - (__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_shuffle_i64x2(A, B, imm) \ - (__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(imm))) #define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512())) #define _mm512_shuffle_pd(A, B, M) \ - (__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(M)) + ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(M))) #define _mm512_mask_shuffle_pd(W, U, A, B, M) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ - (__v8df)(__m512d)(W)) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ + (__v8df)(__m512d)(W))) #define _mm512_maskz_shuffle_pd(U, A, B, M) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ - (__v8df)_mm512_setzero_pd()) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ + (__v8df)_mm512_setzero_pd())) #define _mm512_shuffle_ps(A, B, M) \ - (__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(M)) + ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(M))) #define _mm512_mask_shuffle_ps(W, U, A, B, M) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ - (__v16sf)(__m512)(W)) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ + (__v16sf)(__m512)(W))) #define _mm512_maskz_shuffle_ps(U, A, B, M) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ - (__v16sf)_mm512_setzero_ps()) + 
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ + (__v16sf)_mm512_setzero_ps())) #define _mm_sqrt_round_sd(A, B, R) \ - (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) @@ -6742,10 +6742,10 @@ _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_mask_sqrt_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) @@ -6758,16 +6758,16 @@ _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) } #define _mm_maskz_sqrt_round_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm_sqrt_round_ss(A, B, R) \ - (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) @@ -6780,10 +6780,10 @@ _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_mask_sqrt_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) @@ -6796,10 +6796,10 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) } #define _mm_maskz_sqrt_round_ss(U, A, B, R) \ - (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_broadcast_f32x4(__m128 __A) @@ -7366,183 +7366,183 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) } #define _mm512_extracti32x4_epi32(A, imm) \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1) + ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)_mm_undefined_si128(), \ + (__mmask8)-1)) #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \ - 
(__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extracti32x4_epi32(U, A, imm) \ - (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm512_extracti64x4_epi64(A, imm) \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_undefined_si256(), \ - (__mmask8)-1) + ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)_mm256_undefined_si256(), \ + (__mmask8)-1)) #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)) + ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)(__m256i)(W), \ + (__mmask8)(U))) #define _mm512_maskz_extracti64x4_epi64(U, A, imm) \ - (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)) + ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)_mm256_setzero_si256(), \ + (__mmask8)(U))) #define _mm512_insertf64x4(A, B, imm) \ - (__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ - (__v4df)(__m256d)(B), (int)(imm)) + ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ + (__v4df)(__m256d)(B), (int)(imm))) #define _mm512_mask_insertf64x4(W, U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ - (__v8df)(__m512d)(W)) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)(__m512d)(W))) #define _mm512_maskz_insertf64x4(U, A, B, imm) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd()) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd())) #define _mm512_inserti64x4(A, B, imm) \ - (__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \ - (__v4di)(__m256i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \ + (__v4di)(__m256i)(B), (int)(imm))) #define _mm512_mask_inserti64x4(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_inserti64x4(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512())) #define _mm512_insertf32x4(A, B, imm) \ - (__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \ - (__v4sf)(__m128)(B), (int)(imm)) + ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \ + (__v4sf)(__m128)(B), (int)(imm))) #define _mm512_mask_insertf32x4(W, U, A, B, imm) \ - 
(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ - (__v16sf)(__m512)(W)) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)(__m512)(W))) #define _mm512_maskz_insertf32x4(U, A, B, imm) \ - (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps()) + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps())) #define _mm512_inserti32x4(A, B, imm) \ - (__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \ - (__v4si)(__m128i)(B), (int)(imm)) + ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \ + (__v4si)(__m128i)(B), (int)(imm))) #define _mm512_mask_inserti32x4(W, U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ - (__v16si)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_inserti32x4(U, A, B, imm) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_getmant_round_pd(A, B, C, R) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) #define _mm512_getmant_pd(A, B, C) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_getmant_pd(W, U, A, B, C) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_getmant_pd(U, A, B, C) \ - (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION) + 
((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_getmant_round_ps(A, B, C, R) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2) | (B)), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2) | (B)), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2) | (B)), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) #define _mm512_getmant_ps(A, B, C) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2)|(B)), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2)|(B)), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_mask_getmant_ps(W, U, A, B, C) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2)|(B)), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2)|(B)), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_maskz_getmant_ps(U, A, B, C) \ - (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2)|(B)), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION) + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2)|(B)), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm512_getexp_round_pd(A, R) \ - (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_getexp_round_pd(W, U, A, R) \ - (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_getexp_round_pd(U, A, R) \ - (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_getexp_pd (__m512d __A) @@ -7572,19 +7572,19 @@ _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) } #define _mm512_getexp_round_ps(A, R) \ - 
(__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)) + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) #define _mm512_mask_getexp_round_ps(W, U, A, R) \ - (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) #define _mm512_maskz_getexp_round_ps(U, A, R) \ - (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)) + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_getexp_ps (__m512 __A) @@ -7614,100 +7614,100 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) } #define _mm512_i64gather_ps(index, addr, scale) \ - (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale)) + ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), (__mmask8)-1, \ + (int)(scale))) #define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \ - (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)) - -#define _mm512_i64gather_epi32(index, addr, scale) \ - (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \ + ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ (void const *)(addr), \ (__v8di)(__m512i)(index), \ - (__mmask8)-1, (int)(scale)) + (__mmask8)(mask), (int)(scale))) + +#define _mm512_i64gather_epi32(index, addr, scale) \ + ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)-1, (int)(scale))) #define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \ - (__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm512_i64gather_pd(index, addr, scale) \ - (__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale)) + ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), (__mmask8)-1, \ + (int)(scale))) #define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \ - (__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm512_i64gather_epi64(index, addr, scale) \ - (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale)) + 
((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), (__mmask8)-1, \ + (int)(scale))) #define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \ - (__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm512_i32gather_ps(index, addr, scale) \ - (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ - (void const *)(addr), \ - (__v16si)(__m512)(index), \ - (__mmask16)-1, (int)(scale)) + ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ + (void const *)(addr), \ + (__v16si)(__m512)(index), \ + (__mmask16)-1, (int)(scale))) #define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \ - (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ - (void const *)(addr), \ - (__v16si)(__m512)(index), \ - (__mmask16)(mask), (int)(scale)) + ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ + (void const *)(addr), \ + (__v16si)(__m512)(index), \ + (__mmask16)(mask), (int)(scale))) #define _mm512_i32gather_epi32(index, addr, scale) \ - (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \ - (void const *)(addr), \ - (__v16si)(__m512i)(index), \ - (__mmask16)-1, (int)(scale)) + ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \ + (void const *)(addr), \ + (__v16si)(__m512i)(index), \ + (__mmask16)-1, (int)(scale))) #define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \ - (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \ - (void const *)(addr), \ - (__v16si)(__m512i)(index), \ - (__mmask16)(mask), (int)(scale)) + ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \ + (void const *)(addr), \ + (__v16si)(__m512i)(index), \ + (__mmask16)(mask), (int)(scale))) #define _mm512_i32gather_pd(index, addr, scale) \ - (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), (__mmask8)-1, \ - (int)(scale)) + ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), (__mmask8)-1, \ + (int)(scale))) #define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \ - (__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm512_i32gather_epi64(index, addr, scale) \ - (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), (__mmask8)-1, \ - (int)(scale)) + ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), (__mmask8)-1, \ + (int)(scale))) #define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \ - (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \ + (void const *)(addr), \ + 
(__v8si)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm512_i64scatter_ps(addr, index, v1, scale) \ __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \ @@ -7800,16 +7800,16 @@ _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_fmadd_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fmadd_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) @@ -7822,10 +7822,10 @@ _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) } #define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) @@ -7838,10 +7838,10 @@ _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) } #define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) @@ -7854,16 +7854,16 @@ _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_fmsub_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fmsub_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) @@ -7876,10 +7876,10 @@ _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) } #define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) 
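Every hunk in this header applies the same mechanical change: the full expansion of each intrinsic macro gains one outer pair of parentheses, so the expansion behaves as a single primary expression at every use site. As a minimal sketch of the kind of precedence surprise that full parenthesization guards against (using a hypothetical macro and helper, not anything from this patch, and not claiming this is the patch's stated rationale): an expansion that ends in a cast expression binds more loosely than postfix operators and cannot be the operand of sizeof, so subscripting or sizeof-ing the unwrapped result parses differently than intended.

/* Illustration only: hypothetical macro and helper names, not part of the patch. */
#include <stdio.h>

static int storage[4];                               /* real int objects to index into   */
static char *get_buf(void) { return (char *)storage; }

#define FIRST_INT_UNWRAPPED()  (int *)get_buf()      /* expansion ends in a bare cast          */
#define FIRST_INT_WRAPPED()    ((int *)get_buf())    /* whole expansion is one parenthesized expression */

int main(void) {
  /* FIRST_INT_UNWRAPPED()[0] expands to (int *)get_buf()[0], which parses as
   * (int *)(get_buf()[0]) -- the first char converted to a pointer -- because
   * postfix [] binds tighter than a cast.  The wrapped form indexes the pointer. */
  int first = FIRST_INT_WRAPPED()[0];

  /* sizeof FIRST_INT_UNWRAPPED() would parse as (sizeof(int *)) get_buf() and fail
   * to compile, since sizeof cannot take a cast expression without parentheses;
   * the wrapped form is a valid (unevaluated) sizeof operand. */
  printf("%d %zu\n", first, sizeof FIRST_INT_WRAPPED());
  return 0;
}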
@@ -7892,10 +7892,10 @@ _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) } #define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \ - (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) @@ -7908,16 +7908,16 @@ _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_fnmadd_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) @@ -7930,10 +7930,10 @@ _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) } #define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) @@ -7946,10 +7946,10 @@ _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) } #define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) @@ -7962,16 +7962,16 @@ _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) } #define _mm_fnmsub_round_ss(A, B, C, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) @@ -7984,10 +7984,10 @@ _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) } #define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \ - 
(__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) @@ -8000,10 +8000,10 @@ _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) } #define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \ - (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) @@ -8016,16 +8016,16 @@ _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_fmadd_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fmadd_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) @@ -8038,10 +8038,10 @@ _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) } #define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) @@ -8054,10 +8054,10 @@ _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) } #define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) @@ -8070,16 +8070,16 @@ _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_fmsub_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fmsub_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(A), \ - 
-(__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) @@ -8092,10 +8092,10 @@ _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) } #define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) @@ -8108,10 +8108,10 @@ _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) } #define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \ - (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) @@ -8124,16 +8124,16 @@ _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) } #define _mm_fnmadd_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) @@ -8146,10 +8146,10 @@ _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) } #define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) @@ -8162,10 +8162,10 @@ _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) } #define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) @@ -8178,16 +8178,16 @@ _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d 
__A, __m128d __B) } #define _mm_fnmsub_round_sd(A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) #define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) @@ -8200,11 +8200,11 @@ _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) } #define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ - (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), \ - (__mmask8)(U), \ - (int)(R)) + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), \ + (__mmask8)(U), \ + (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) @@ -8217,36 +8217,36 @@ _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) } #define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \ - (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__mmask8)(U), (int)(R))) #define _mm512_permutex_pd(X, C) \ - (__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)) + ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C))) #define _mm512_mask_permutex_pd(W, U, X, C) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permutex_pd((X), (C)), \ - (__v8df)(__m512d)(W)) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permutex_pd((X), (C)), \ + (__v8df)(__m512d)(W))) #define _mm512_maskz_permutex_pd(U, X, C) \ - (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permutex_pd((X), (C)), \ - (__v8df)_mm512_setzero_pd()) + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permutex_pd((X), (C)), \ + (__v8df)_mm512_setzero_pd())) #define _mm512_permutex_epi64(X, C) \ - (__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)) + ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C))) #define _mm512_mask_permutex_epi64(W, U, X, C) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_permutex_epi64((X), (C)), \ - (__v8di)(__m512i)(W)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_permutex_epi64((X), (C)), \ + (__v8di)(__m512i)(W))) #define _mm512_maskz_permutex_epi64(U, X, C) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_permutex_epi64((X), (C)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_permutex_epi64((X), (C)), \ + (__v8di)_mm512_setzero_si512())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_permutexvar_pd (__m512i __X, __m512d __Y) @@ -8416,10 +8416,10 @@ _mm512_kxor (__mmask16 __A, 
__mmask16 __B) #define _kxor_mask16 _mm512_kxor #define _kshiftli_mask16(A, I) \ - (__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)) + ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I))) #define _kshiftri_mask16(A, I) \ - (__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)) + ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I))) static __inline__ unsigned int __DEFAULT_FN_ATTRS _cvtmask16_u32(__mmask16 __A) { @@ -8538,48 +8538,48 @@ _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) } #define _mm_cmp_round_ss_mask(X, Y, P, R) \ - (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)-1, (int)(R)) + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ - (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)(M), (int)(R)) + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)(M), (int)(R))) #define _mm_cmp_ss_mask(X, Y, P) \ - (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_cmp_ss_mask(M, X, Y, P) \ - (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)(M), \ - _MM_FROUND_CUR_DIRECTION) + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)(M), \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_cmp_round_sd_mask(X, Y, P, R) \ - (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)-1, (int)(R)) + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ - (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)(M), (int)(R)) + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)(M), (int)(R))) #define _mm_cmp_sd_mask(X, Y, P) \ - (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION) + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) #define _mm_mask_cmp_sd_mask(M, X, Y, P) \ - (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)(M), \ - _MM_FROUND_CUR_DIRECTION) + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)(M), \ + _MM_FROUND_CUR_DIRECTION)) /* Bit Test */ @@ -8760,17 +8760,17 @@ _mm_maskz_load_sd (__mmask8 __U, const double* __A) } #define _mm512_shuffle_epi32(A, I) \ - (__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)) + ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I))) #define _mm512_mask_shuffle_epi32(W, U, A, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_epi32((A), (I)), \ - (__v16si)(__m512i)(W)) + 
((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_epi32((A), (I)), \ + (__v16si)(__m512i)(W))) #define _mm512_maskz_shuffle_epi32(U, A, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_epi32((A), (I)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_epi32((A), (I)), \ + (__v16si)_mm512_setzero_si512())) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) @@ -8901,19 +8901,19 @@ _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) } #define _mm512_cvt_roundps_pd(A, R) \ - (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm512_mask_cvt_roundps_pd(W, U, A, R) \ - (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm512_maskz_cvt_roundps_pd(U, A, R) \ - (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtps_pd (__m256 __A) @@ -9010,22 +9010,22 @@ _mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) } #define _mm_cvt_roundsd_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ - (__v2df)(__m128d)(B), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1, (int)(R)) + ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ + (__v2df)(__m128d)(B), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \ - (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ - (__v2df)(__m128d)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ + (__v2df)(__m128d)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \ - (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ - (__v2df)(__m128d)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)) + ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ + (__v2df)(__m128d)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) @@ -9058,47 +9058,47 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) #ifdef __x86_64__ #define _mm_cvt_roundi64_sd(A, B, R) \ - (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ - (int)(R)) + ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ + (int)(R))) #define _mm_cvt_roundsi64_sd(A, B, R) \ - (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ - (int)(R)) + ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ + (int)(R))) #endif #define _mm_cvt_roundsi32_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)) + 
((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))) #define _mm_cvt_roundi32_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)) + ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))) #ifdef __x86_64__ #define _mm_cvt_roundsi64_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ - (int)(R)) + ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ + (int)(R))) #define _mm_cvt_roundi64_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ - (int)(R)) + ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ + (int)(R))) #endif #define _mm_cvt_roundss_sd(A, B, R) \ - (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ - (__v4sf)(__m128)(B), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1, (int)(R)) + ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ + (__v4sf)(__m128)(B), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1, (int)(R))) #define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \ - (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ - (__v4sf)(__m128)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ + (__v4sf)(__m128)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) #define _mm_maskz_cvt_roundss_sd(U, A, B, R) \ - (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ - (__v4sf)(__m128)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)) + ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ + (__v4sf)(__m128)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) @@ -9127,8 +9127,8 @@ _mm_cvtu32_sd (__m128d __A, unsigned __B) #ifdef __x86_64__ #define _mm_cvt_roundu64_sd(A, B, R) \ - (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ - (unsigned long long)(B), (int)(R)) + ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ + (unsigned long long)(B), (int)(R))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtu64_sd (__m128d __A, unsigned long long __B) @@ -9139,8 +9139,8 @@ _mm_cvtu64_sd (__m128d __A, unsigned long long __B) #endif #define _mm_cvt_roundu32_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ - (int)(R)) + ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ + (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtu32_ss (__m128 __A, unsigned __B) @@ -9151,8 +9151,8 @@ _mm_cvtu32_ss (__m128 __A, unsigned __B) #ifdef __x86_64__ #define _mm_cvt_roundu64_ss(A, B, R) \ - (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ - (unsigned long long)(B), (int)(R)) + ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ + (unsigned long long)(B), (int)(R))) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtu64_ss (__m128 __A, unsigned long long __B) diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index 926033e20152c..48370d0bf0ee0 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -269,10 +269,539 @@ _mm512_zextph256_ph512(__m256h __a) { 29, 30, 31); } +#define _mm_comi_round_sh(A, B, P, R) \ + __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R)) + +#define _mm_comi_sh(A, B, 
pred) \ + _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION) + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A + (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_add_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_add_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_add_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_add_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A, + __m512h __B) { 
+ return (__m512h)((__v32hf)__A - (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_sub_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_sub_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_sub_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_sub_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A * (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_mul_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_mul_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_mul_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_mul_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A / (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_div_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_div_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_div_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_div_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B, + 
_MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_min_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_min_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_min_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_min_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_max_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_max_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_max_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_max_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); } +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, + __m128h __B) { + __A[0] += __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_add_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_add_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_add_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_add_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_add_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static 
__inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A, + __m128h __B) { + __A[0] -= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_sub_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_sub_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_sub_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_sub_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_sub_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A, + __m128h __B) { + __A[0] *= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_mul_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_mul_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_mul_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_mul_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_mul_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A, + __m128h __B) { + __A[0] /= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_div_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_div_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_div_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_div_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_div_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask( + (__v8hf)__A, 
(__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_min_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_min_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_min_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_max_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_max_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_max_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_cmp_round_ph_mask(A, B, P, R) \ + ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(P), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \ + ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(P), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_cmp_ph_mask(A, B, P) \ + _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_cmp_ph_mask(U, A, B, P) \ + _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm_cmp_round_sh_mask(X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), (int)(P), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), (int)(P), \ + (__mmask8)(M), (int)(R))) + +#define _mm_cmp_sh_mask(X, Y, P) \ 
+ ((__mmask8)__builtin_ia32_cmpsh_mask( \ + (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_sh_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpsh_mask( \ + (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \ + _MM_FROUND_CUR_DIRECTION)) // loads with vmovsh: static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) { struct __mm_load_sh_struct { @@ -418,6 +947,1502 @@ static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) { return __b[0]; } +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) { + return (__m512h)__builtin_ia32_rcpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W, + (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rcpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) { + return (__m512h)__builtin_ia32_rsqrtph512_mask( + (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W, + (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rsqrtph512_mask( + (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); +} + +#define _mm512_getmant_ph(A, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_ph(W, U, A, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_ph(U, A, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getmant_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) + +#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) { + return (__m512h)__builtin_ia32_getexpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_getexpph512_mask( + (__v32hf)__A, 
(__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_getexpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_getexp_round_ph(A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_getexp_round_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_getexp_round_ph(U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_scalefph512_mask( + (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B, + (__v32hf)__W, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_scalefph512_mask( + (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_scalef_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_scalefph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) + +#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_scalefph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_scalef_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_scalefph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +#define _mm512_roundscale_ph(A, B) \ + ((__m512h)__builtin_ia32_rndscaleph_mask( \ + (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_ph(A, B, C, imm) \ + ((__m512h)__builtin_ia32_rndscaleph_mask( \ + (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \ + (__mmask32)(B), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_roundscale_ph(A, B, imm) \ + ((__m512h)__builtin_ia32_rndscaleph_mask( \ + (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \ + ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \ + (__v32hf)(__m512h)(A), \ + (__mmask32)(B), (int)(R))) + +#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \ + ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \ + (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(A), (int)(R))) + +#define _mm512_roundscale_round_ph(A, imm, R) \ + ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_reduce_ph(A, imm) \ + ((__m512h)__builtin_ia32_reduceph512_mask( \ 
+ (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_ph(W, U, A, imm) \ + ((__m512h)__builtin_ia32_reduceph512_mask( \ + (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_reduce_ph(U, A, imm) \ + ((__m512h)__builtin_ia32_reduceph512_mask( \ + (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \ + ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)(__m512h)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \ + ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_reduce_round_ph(A, imm, R) \ + ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rcpsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rcpsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rsqrtsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_rsqrtsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +#define _mm_getmant_round_sh(A, B, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R))) + +#define _mm_getmant_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_sh(W, U, A, B, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_getmant_sh(U, A, B, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), 
(int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm_getexp_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_getexpsh128_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_getexpsh128_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_getexp_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_getexpsh128_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_getexp_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_scalef_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_scalefsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_scalefsh_round_mask( + (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_scalef_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_scalefsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_scalef_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_scalefsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_roundscale_round_sh(A, B, imm, R) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(imm), (int)(R))) + +#define _mm_roundscale_sh(A, B, imm) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(imm), 
_MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_sh(W, U, A, B, I) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(I), (int)(R))) + +#define _mm_maskz_roundscale_sh(U, A, B, I) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(I), (int)(R))) + +#define _mm_reduce_sh(A, B, C) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_reduce_sh(W, U, A, B, C) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_reduce_sh(U, A, B, C) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_reduce_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(C), (int)(R))) + +#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(C), (int)(R))) + +#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(C), (int)(R))) + +#define _mm512_sqrt_round_ph(A, R) \ + ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R))) + +#define _mm512_mask_sqrt_round_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_sqrt_round_ph(U, A, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) { + return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)(__U), + (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), + (__v32hf)(__m512h)(__W)); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)(__U), + (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm_sqrt_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ 
+ (__mmask8)-1, (int)(R))) + +#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_sqrt_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_sqrtsh_round_mask( + (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), + (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W, + __mmask32 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_sqrtsh_round_mask( + (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W), + (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_sqrtsh_round_mask( + (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), + (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fpclass_ph_mask(U, A, imm) \ + ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ + (int)(imm), (__mmask32)(U))) + +#define _mm512_fpclass_ph_mask(A, imm) \ + ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ + (int)(imm), (__mmask32)-1)) + +#define _mm_fpclass_sh_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_mask_fpclass_sh_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__mmask8)(U))) + +#define _mm512_cvt_roundpd_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ + (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ + (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( + (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( + (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( + (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_pd(A, R) \ + ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ + (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \ + ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_pd(U, A, R) \ + ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ + (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R))) + +static 
__inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) { + return (__m512d)__builtin_ia32_vcvtph2pd512_mask( + (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) { + return (__m512d)__builtin_ia32_vcvtph2pd512_mask( + (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { + return (__m512d)__builtin_ia32_vcvtph2pd512_mask( + (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_ss(A, B, R) \ + ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \ + (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A, + __m128h __B) { + return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( + (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W, + __mmask8 __U, + __m128 __A, + __m128h __B) { + return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U, + __m128 __A, + __m128h __B) { + return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( + (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundss_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ + (__v8hf)_mm_undefined_ph(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \ + (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( + (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( + (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U, + __m128h __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( + (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsd_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ + (__v8hf)_mm_undefined_ph(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundsd_sh(W, 
U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \ + (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A, + __m128d __B) { + return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( + (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128d __B) { + return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( + (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) { + return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( + (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_sd(A, B, R) \ + ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \ + (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A, + __m128h __B) { + return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( + (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W, + __mmask8 __U, + __m128d __A, + __m128h __B) { + return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( + (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) { + return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( + (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epi16(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ + (__v32hi)_mm512_undefined_epi32(), \ + (__mmask32)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ + (__v32hi)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epi16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2w512_mask( + (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) { + return 
(__m512i)__builtin_ia32_vcvtph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epi16(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2w512_mask( \ + (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \ + (__v32hi)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epi16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2w512_mask( + (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi16_ph(A, R) \ + ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \ + ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \ + (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_cvtepi16_ph(__m512i __A) { + return (__m512h)__builtin_ia32_vcvtw2ph512_mask( + (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtw2ph512_mask( + (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtw2ph512_mask( + (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epu16(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \ + (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \ + (__v32hu)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epu16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2uw512_mask( + 
(__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epu16(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \ + (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \ + (__v32hu)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epu16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2uw512_mask( + (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepu16_ph(A, R) \ + ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \ + ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \ + (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_cvtepu16_ph(__m512i __A) { + return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( + (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( + (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( + (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epi32(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \ + (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \ + (__v16si)_mm512_setzero_epi32(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epi32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, + 
_MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2dq512_mask( + (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epu32(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \ + (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \ + (__v16su)_mm512_setzero_epi32(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epu32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2udq512_mask( + (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi32_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \ + (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \ + (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_cvtepi32_ph(__m512i __A) { + return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( + (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( + (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( + (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepu32_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \ + (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \ + (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ 
__m256h __DEFAULT_FN_ATTRS512 +_mm512_cvtepu32_ph(__m512i __A) { + return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( + (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( + (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( + (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epi32(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \ + (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \ + (__v16si)_mm512_setzero_epi32(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epi32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2dq512_mask( + (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epu32(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ + (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ + (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \ + (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epu32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2udq512_mask( + (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi64_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ + (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \ + (__mmask8)(U), 
(int)(R))) + +#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ + (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_cvtepi64_ph(__m512i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( + (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( + (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( + (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epi64(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \ + (__v8di)_mm512_undefined_epi32(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \ + (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epi64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2qq512_mask( + (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepu64_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ + (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ + (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_cvtepu64_ph(__m512i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( + (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( + (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( + (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epu64(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \ 
+ ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epu64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( + (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epi64(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ + (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ + (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epi64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2qq512_mask( + (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epu64(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epu64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( + (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_i32(A, R) \ + 
((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R))) + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) { + return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_u32(A, R) \ + ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS128 +_mm_cvtsh_u32(__m128h __A) { + return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvt_roundsh_i64(A, R) \ + ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R))) + +static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) { + return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_u64(A, R) \ + ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 +_mm_cvtsh_u64(__m128h __A) { + return (unsigned long long)__builtin_ia32_vcvtsh2usi64( + (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); +} +#endif // __x86_64__ + +#define _mm_cvt_roundu32_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_cvtu32_sh(__m128h __A, unsigned int __B) { + __A[0] = __B; + return __A; +} + +#ifdef __x86_64__ +#define _mm_cvt_roundu64_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \ + (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_cvtu64_sh(__m128h __A, unsigned long long __B) { + __A[0] = __B; + return __A; +} +#endif + +#define _mm_cvt_roundi32_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A, + int __B) { + __A[0] = __B; + return __A; +} + +#ifdef __x86_64__ +#define _mm_cvt_roundi64_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A, + long long __B) { + __A[0] = __B; + return __A; +} +#endif + +#define _mm_cvtt_roundsh_i32(A, R) \ + ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R))) + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) { + return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvtt_roundsh_i64(A, R) \ + ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R))) + +static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) { + return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm_cvtt_roundsh_u32(A, R) \ + ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS128 +_mm_cvttsh_u32(__m128h __A) { + return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvtt_roundsh_u64(A, R) \ + ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 +_mm_cvttsh_u64(__m128h __A) { + return (unsigned long long)__builtin_ia32_vcvttsh2usi64( + (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm512_cvtx_roundph_ps(A, R) \ + ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + 
(__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \ + ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \ + ((__m512)__builtin_ia32_vcvtph2psx512_mask( \ + (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R))) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) { + return (__m512)__builtin_ia32_vcvtph2psx512_mask( + (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) { + return (__m512)__builtin_ia32_vcvtph2psx512_mask( + (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) { + return (__m512)__builtin_ia32_vcvtph2psx512_mask( + (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtx_roundps_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \ + (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \ + (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) { + return (__m256h)__builtin_ia32_vcvtps2phx512_mask( + (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) { + return (__m256h)__builtin_ia32_vcvtps2phx512_mask( + (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) { + return (__m256h)__builtin_ia32_vcvtps2phx512_mask( + (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_add_ph(__m512h __W) { + return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_mul_ph(__m512h __W) { + return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_ph(__m512h __V) { + return __builtin_ia32_reduce_fmax_ph512(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_ph(__m512h __V) { + return __builtin_ia32_reduce_fmin_ph512(__V); +} + static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, diff --git a/clang/lib/Headers/avx512vbmi2intrin.h b/clang/lib/Headers/avx512vbmi2intrin.h index a23144616ce36..17fa77722c64f 100644 --- a/clang/lib/Headers/avx512vbmi2intrin.h +++ b/clang/lib/Headers/avx512vbmi2intrin.h @@ -129,88 +129,88 @@ _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P) } #define _mm512_shldi_epi64(A, B, I) \ - (__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I)) + 
((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I))) #define _mm512_mask_shldi_epi64(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ - (__v8di)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ + (__v8di)(__m512i)(S))) #define _mm512_maskz_shldi_epi64(U, A, B, I) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ + (__v8di)_mm512_setzero_si512())) #define _mm512_shldi_epi32(A, B, I) \ - (__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I))) #define _mm512_mask_shldi_epi32(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ - (__v16si)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ + (__v16si)(__m512i)(S))) #define _mm512_maskz_shldi_epi32(U, A, B, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_shldi_epi16(A, B, I) \ - (__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \ - (__v32hi)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \ + (__v32hi)(__m512i)(B), (int)(I))) #define _mm512_mask_shldi_epi16(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ - (__v32hi)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ + (__v32hi)(__m512i)(S))) #define _mm512_maskz_shldi_epi16(U, A, B, I) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ - (__v32hi)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ + (__v32hi)_mm512_setzero_si512())) #define _mm512_shrdi_epi64(A, B, I) \ - (__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I))) #define _mm512_mask_shrdi_epi64(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ - (__v8di)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ + (__v8di)(__m512i)(S))) #define _mm512_maskz_shrdi_epi64(U, A, B, I) \ - (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ - (__v8di)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ + (__v8di)_mm512_setzero_si512())) #define _mm512_shrdi_epi32(A, B, I) \ - (__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I))) #define 
_mm512_mask_shrdi_epi32(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ - (__v16si)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ + (__v16si)(__m512i)(S))) #define _mm512_maskz_shrdi_epi32(U, A, B, I) \ - (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ - (__v16si)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ + (__v16si)_mm512_setzero_si512())) #define _mm512_shrdi_epi16(A, B, I) \ - (__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \ - (__v32hi)(__m512i)(B), (int)(I)) + ((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \ + (__v32hi)(__m512i)(B), (int)(I))) #define _mm512_mask_shrdi_epi16(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ - (__v32hi)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ + (__v32hi)(__m512i)(S))) #define _mm512_maskz_shrdi_epi16(U, A, B, I) \ - (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ - (__v32hi)_mm512_setzero_si512()) + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ + (__v32hi)_mm512_setzero_si512())) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 6ed10ed9803ba..7873516053ece 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -21,84 +21,84 @@ /* Integer compare */ #define _mm_cmp_epi8_mask(a, b, p) \ - (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm_mask_cmp_epi8_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm_cmp_epu8_mask(a, b, p) \ - (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm_mask_cmp_epu8_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm256_cmp_epi8_mask(a, b, p) \ - (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1) + ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)-1)) #define _mm256_mask_cmp_epi8_mask(m, a, b, p) \ - (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m)) + ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ 
+ (__mmask32)(m))) #define _mm256_cmp_epu8_mask(a, b, p) \ - (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1) + ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)-1)) #define _mm256_mask_cmp_epu8_mask(m, a, b, p) \ - (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m)) + ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)(m))) #define _mm_cmp_epi16_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epi16_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_epu16_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epu16_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_epi16_mask(a, b, p) \ - (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm256_mask_cmp_epi16_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm256_cmp_epu16_mask(a, b, p) \ - (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1) + ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)-1)) #define _mm256_mask_cmp_epu16_mask(m, a, b, p) \ - (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m)) + ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)(m))) #define _mm_cmpeq_epi8_mask(A, B) \ _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) @@ -1821,46 +1821,46 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) #define _mm_mask_shufflehi_epi16(W, U, A, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ - (__v8hi)(__m128i)(W)) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ + (__v8hi)(__m128i)(W))) #define _mm_maskz_shufflehi_epi16(U, A, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ - (__v8hi)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), 
\ + (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ + (__v8hi)_mm_setzero_si128())) #define _mm256_mask_shufflehi_epi16(W, U, A, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ - (__v16hi)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ + (__v16hi)(__m256i)(W))) #define _mm256_maskz_shufflehi_epi16(U, A, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ - (__v16hi)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ + (__v16hi)_mm256_setzero_si256())) #define _mm_mask_shufflelo_epi16(W, U, A, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ - (__v8hi)(__m128i)(W)) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ + (__v8hi)(__m128i)(W))) #define _mm_maskz_shufflelo_epi16(U, A, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ - (__v8hi)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ + (__v8hi)_mm_setzero_si128())) #define _mm256_mask_shufflelo_epi16(W, U, A, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflelo_epi16((A), \ - (imm)), \ - (__v16hi)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflelo_epi16((A), \ + (imm)), \ + (__v16hi)(__m256i)(W))) #define _mm256_maskz_shufflelo_epi16(U, A, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflelo_epi16((A), \ - (imm)), \ - (__v16hi)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflelo_epi16((A), \ + (imm)), \ + (__v16hi)_mm256_setzero_si256())) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi16(__m256i __A, __m256i __B) @@ -2756,52 +2756,52 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, } #define _mm_mask_alignr_epi8(W, U, A, B, N) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ - (__v16qi)(__m128i)(W)) + (__v16qi)(__m128i)(W))) #define _mm_maskz_alignr_epi8(U, A, B, N) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ - (__v16qi)_mm_setzero_si128()) + (__v16qi)_mm_setzero_si128())) #define _mm256_mask_alignr_epi8(W, U, A, B, N) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ - (__v32qi)(__m256i)(W)) + (__v32qi)(__m256i)(W))) #define _mm256_maskz_alignr_epi8(U, A, B, N) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ - (__v32qi)_mm256_setzero_si256()) + (__v32qi)_mm256_setzero_si256())) #define _mm_dbsad_epu8(A, B, imm) \ - (__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(imm)) + ((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(imm))) #define _mm_mask_dbsad_epu8(W, U, A, B, imm) \ - 
(__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ - (__v8hi)(__m128i)(W)) + (__v8hi)(__m128i)(W))) #define _mm_maskz_dbsad_epu8(U, A, B, imm) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ - (__v8hi)_mm_setzero_si128()) + (__v8hi)_mm_setzero_si128())) #define _mm256_dbsad_epu8(A, B, imm) \ - (__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), (int)(imm))) #define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ - (__v16hi)(__m256i)(W)) + (__v16hi)(__m256i)(W))) #define _mm256_maskz_dbsad_epu8(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ - (__v16hi)_mm256_setzero_si256()) + (__v16hi)_mm256_setzero_si256())) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h index 95ba574ea8210..713e1a18a1b3f 100644 --- a/clang/lib/Headers/avx512vldqintrin.h +++ b/clang/lib/Headers/avx512vldqintrin.h @@ -773,134 +773,134 @@ _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) { } #define _mm_range_pd(A, B, C) \ - (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) #define _mm_mask_range_pd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm_maskz_range_pd(U, A, B, C) \ - (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm256_range_pd(A, B, C) \ - (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) #define _mm256_mask_range_pd(W, U, A, B, C) \ - (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) #define _mm256_maskz_range_pd(U, A, B, C) \ - (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ + 
(__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) #define _mm_range_ps(A, B, C) \ - (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) #define _mm_mask_range_ps(W, U, A, B, C) \ - (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)(__m128)(W), (__mmask8)(U)) + ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)(__m128)(W), (__mmask8)(U))) #define _mm_maskz_range_ps(U, A, B, C) \ - (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) #define _mm256_range_ps(A, B, C) \ - (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) #define _mm256_mask_range_ps(W, U, A, B, C) \ - (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)(__m256)(W), (__mmask8)(U)) + ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)(__m256)(W), (__mmask8)(U))) #define _mm256_maskz_range_ps(U, A, B, C) \ - (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) #define _mm_reduce_pd(A, B) \ - (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) #define _mm_mask_reduce_pd(W, U, A, B) \ - (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm_maskz_reduce_pd(U, A, B) \ - (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm256_reduce_pd(A, B) \ - (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) #define _mm256_mask_reduce_pd(W, U, A, B) \ - (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) #define _mm256_maskz_reduce_pd(U, A, B) \ - 
(__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) #define _mm_reduce_ps(A, B) \ - (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) #define _mm_mask_reduce_ps(W, U, A, B) \ - (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) #define _mm_maskz_reduce_ps(U, A, B) \ - (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) #define _mm256_reduce_ps(A, B) \ - (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) #define _mm256_mask_reduce_ps(W, U, A, B) \ - (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) #define _mm256_maskz_reduce_ps(U, A, B) \ - (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_movepi32_mask (__m128i __A) @@ -1066,100 +1066,100 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) } #define _mm256_extractf64x2_pd(A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1)) #define _mm256_mask_extractf64x2_pd(W, U, A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm256_maskz_extractf64x2_pd(U, A, imm) \ - (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm256_extracti64x2_epi64(A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ (int)(imm), \ (__v2di)_mm_undefined_si128(), \ - (__mmask8)-1) + (__mmask8)-1)) #define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U)) + 
((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + (int)(imm), \ + (__v2di)(__m128i)(W), \ + (__mmask8)(U))) #define _mm256_maskz_extracti64x2_epi64(U, A, imm) \ - (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + (int)(imm), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm256_insertf64x2(A, B, imm) \ - (__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \ - (__v2df)(__m128d)(B), (int)(imm)) + ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \ + (__v2df)(__m128d)(B), (int)(imm))) #define _mm256_mask_insertf64x2(W, U, A, B, imm) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ - (__v4df)(__m256d)(W)) + (__v4df)(__m256d)(W))) #define _mm256_maskz_insertf64x2(U, A, B, imm) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ - (__v4df)_mm256_setzero_pd()) + (__v4df)_mm256_setzero_pd())) #define _mm256_inserti64x2(A, B, imm) \ - (__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \ - (__v2di)(__m128i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \ + (__v2di)(__m128i)(B), (int)(imm))) #define _mm256_mask_inserti64x2(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ - (__v4di)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ + (__v4di)(__m256i)(W))) #define _mm256_maskz_inserti64x2(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256())) #define _mm_mask_fpclass_pd_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm_fpclass_pd_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_fpclass_pd_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm256_fpclass_pd_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_fpclass_ps_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm_fpclass_ps_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ + 
(__mmask8)-1)) #define _mm256_mask_fpclass_ps_mask(U, A, imm) \ - (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__mmask8)(U)) + ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__mmask8)(U))) #define _mm256_fpclass_ps_mask(A, imm) \ - (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__mmask8)-1)) #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index cda54bcc8351d..1809211fd4066 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -69,6 +69,240 @@ _mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8), \ (h7), (h6), (h5), (h4), (h3), (h2), (h1)) +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A + (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A + (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A - (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A - (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A * 
(__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A * (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A / (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A / (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U, + __m128h __A, 
+ __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)_mm_setzero_ph()); +} + static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); } @@ -77,6 +311,1066 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); } +#define _mm256_cmp_ph_mask(a, b, p) \ + ((__mmask16)__builtin_ia32_cmpph256_mask( \ + (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1)) + +#define _mm256_mask_cmp_ph_mask(m, a, b, p) \ + ((__mmask16)__builtin_ia32_cmpph256_mask( \ + (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m))) + +#define _mm_cmp_ph_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpph128_mask( \ + (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1)) + +#define _mm_mask_cmp_ph_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpph128_mask( \ + (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m))) + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) { + return (__m256h)__builtin_ia32_rcpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W, + (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rcpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) { + return (__m128h)__builtin_ia32_rcpph128_mask( + (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W, + __mmask8 __U, + __m128h __A) { + return 
(__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_rcpph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) { + return (__m256h)__builtin_ia32_rsqrtph256_mask( + (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W, + (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rsqrtph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) { + return (__m128h)__builtin_ia32_rsqrtph128_mask( + (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W, + __mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_rsqrtph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) { + return (__m128h)__builtin_ia32_getexpph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_getexpph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) { + return (__m256h)__builtin_ia32_getexpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W, + (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_getexpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +#define _mm_getmant_ph(A, B, C) \ + ((__m128h)__builtin_ia32_getmantph128_mask( \ + (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1)) + +#define _mm_mask_getmant_ph(W, U, A, B, C) \ + ((__m128h)__builtin_ia32_getmantph128_mask( \ + (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_ph(U, A, B, C) \ + ((__m128h)__builtin_ia32_getmantph128_mask( \ + (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U))) + +#define _mm256_getmant_ph(A, B, C) \ + ((__m256h)__builtin_ia32_getmantph256_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)-1)) + +#define 
_mm256_mask_getmant_ph(W, U, A, B, C) \ + ((__m256h)__builtin_ia32_getmantph256_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_getmant_ph(U, A, B, C) \ + ((__m256h)__builtin_ia32_getmantph256_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)(U))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_scalefph128_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefph128_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_scalefph256_mask( + (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B, + (__v16hf)__W, (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_scalefph256_mask( + (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +#define _mm_roundscale_ph(A, imm) \ + ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1)) + +#define _mm_mask_roundscale_ph(W, U, A, imm) \ + ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) + +#define _mm_maskz_roundscale_ph(U, A, imm) \ + ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U))) + +#define _mm256_roundscale_ph(A, imm) \ + ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)-1)) + +#define _mm256_mask_roundscale_ph(W, U, A, imm) \ + ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_roundscale_ph(U, A, imm) \ + ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)(U))) + +#define _mm_reduce_ph(A, imm) \ + ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1)) + +#define _mm_mask_reduce_ph(W, U, A, imm) \ + ((__m128h)__builtin_ia32_reduceph128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_ph(U, A, imm) \ + ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U))) + +#define _mm256_reduce_ph(A, imm) \ + ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ + (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)-1)) + +#define 
_mm256_mask_reduce_ph(W, U, A, imm) \ + ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ + (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_reduce_ph(U, A, imm) \ + ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ + (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)(U))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) { + return __builtin_ia32_sqrtph((__v8hf)__a); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W, + __mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) { + return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, + (__v16hf)_mm256_sqrt_ph(__A), + (__v16hf)_mm256_setzero_ph()); +} + +#define _mm_mask_fpclass_ph_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ + (int)(imm), (__mmask8)(U))) + +#define _mm_fpclass_ph_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ + (int)(imm), (__mmask8)-1)) + +#define _mm256_mask_fpclass_ph_mask(U, A, imm) \ + ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ + (int)(imm), (__mmask16)(U))) + +#define _mm256_fpclass_ph_mask(A, imm) \ + ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ + (int)(imm), (__mmask16)-1)) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( + (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W, + __mmask8 __U, + __m128d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( + (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( + (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( + (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) { + return (__m128d)__builtin_ia32_vcvtph2pd128_mask( + (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1); +} + +static 
__inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W, + __mmask8 __U, + __m128h __A) { + return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W, + (__mmask8)__U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { + return (__m128d)__builtin_ia32_vcvtph2pd128_mask( + (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) { + return (__m256d)__builtin_ia32_vcvtph2pd256_mask( + (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) { + return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W, + (__mmask8)__U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { + return (__m256d)__builtin_ia32_vcvtph2pd256_mask( + (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epi16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epi16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2w256_mask( + 
(__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) { + return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_cvtepi16_ph(__m256i __A) { + return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, + (__v16hf)_mm256_cvtepi16_ph(__A), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epu16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epu16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { + 
return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) { + return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_cvtepu16_ph(__m256i __A) { + return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, + (__v16hf)_mm256_cvtepu16_ph(__A), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epi32(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epu32(__m128h 
__A) { + return (__m256i)__builtin_ia32_vcvtph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( + (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( + (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepi32_ph(__m256i __A) { + return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( + (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( + (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepu32_ph(__m256i __A) { + return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epi32(__mmask8 
__U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epi32(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epu32(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( + (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( + (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepi64_ph(__m256i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( + (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( + (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ 
__m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epi64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( + (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( + (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepu64_ph(__m256i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( + (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( + (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epu64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( + (__v8hf)__A, 
(__v4du)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epi64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epu64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) { + return (__m128)__builtin_ia32_vcvtph2psx128_mask( + (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W, + __mmask8 __U, + __m128h __A) { + return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W, + (__mmask8)__U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { + return (__m128)__builtin_ia32_vcvtph2psx128_mask( + (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) { + return (__m256)__builtin_ia32_vcvtph2psx256_mask( + (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) 
{ + return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W, + (__mmask8)__U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { + return (__m256)__builtin_ia32_vcvtph2psx256_mask( + (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx128_mask( + (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W, + __mmask8 __U, + __m128 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx128_mask( + (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx256_mask( + (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx256_mask( + (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, __m128h __A, __m128h __W) { @@ -112,6 +1406,46 @@ _mm256_permutexvar_ph(__m256i __A, __m256h __B) { return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); } +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_add_ph(__m256h __W) { + return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_mul_ph(__m256h __W) { + return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_max_ph(__m256h __V) { + return __builtin_ia32_reduce_fmax_ph256(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_min_ph(__m256h __V) { + return __builtin_ia32_reduce_fmin_ph256(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_add_ph(__m128h __W) { + return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_mul_ph(__m128h __W) { + return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_max_ph(__m128h __V) { + return __builtin_ia32_reduce_fmax_ph128(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_min_ph(__m128h __V) { + return __builtin_ia32_reduce_fmin_ph128(__V); +} + #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 968c10efeac0c..0519dba59081a 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -771,124 +771,124 @@ _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) } #define _mm_cmp_epi32_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ + 
(__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epi32_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_epu32_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epu32_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_epi32_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_epi32_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_epu32_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_epu32_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_epi64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epi64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_epu64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_epu64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_epi64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_epi64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - 
(__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_epu64_mask(a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_epu64_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_ps_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_ps_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (int)(p), \ + (__mmask8)(m))) #define _mm256_cmp_pd_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (int)(p), \ + (__mmask8)-1)) #define _mm256_mask_cmp_pd_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_ps_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_ps_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), (int)(p), \ + (__mmask8)(m))) #define _mm_cmp_pd_mask(a, b, p) \ - (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (int)(p), \ - (__mmask8)-1) + ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ + (__v2df)(__m128d)(b), (int)(p), \ + (__mmask8)-1)) #define _mm_mask_cmp_pd_mask(m, a, b, p) \ - (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (int)(p), \ - (__mmask8)(m)) + ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ + (__v2df)(__m128d)(b), (int)(p), \ + (__mmask8)(m))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) @@ -3289,78 +3289,78 @@ _mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) { } #define _mm_roundscale_pd(A, imm) \ - (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) #define _mm_mask_roundscale_pd(W, U, A, imm) \ - (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ - (int)(imm), \ - 
(__v2df)(__m128d)(W), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm_maskz_roundscale_pd(U, A, imm) \ - (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm256_roundscale_pd(A, imm) \ - (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) #define _mm256_mask_roundscale_pd(W, U, A, imm) \ - (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) #define _mm256_maskz_roundscale_pd(U, A, imm) \ - (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) #define _mm_roundscale_ps(A, imm) \ - (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) #define _mm_mask_roundscale_ps(W, U, A, imm) \ - (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) #define _mm_maskz_roundscale_ps(U, A, imm) \ - (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) #define _mm256_roundscale_ps(A, imm) \ - (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) #define _mm256_mask_roundscale_ps(W, U, A, imm) \ - (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) #define _mm256_maskz_roundscale_ps(U, A, imm) \ - (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_scalef_pd (__m128d __A, __m128d __B) { @@ -4298,56 +4298,56 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { #define _mm_rol_epi32(a, b) \ - (__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b)) + ((__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), 
(int)(b))) #define _mm_mask_rol_epi32(w, u, a, b) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_rol_epi32((a), (b)), \ - (__v4si)(__m128i)(w)) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_rol_epi32((a), (b)), \ + (__v4si)(__m128i)(w))) #define _mm_maskz_rol_epi32(u, a, b) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_rol_epi32((a), (b)), \ - (__v4si)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_rol_epi32((a), (b)), \ + (__v4si)_mm_setzero_si128())) #define _mm256_rol_epi32(a, b) \ - (__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b)) + ((__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b))) #define _mm256_mask_rol_epi32(w, u, a, b) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_rol_epi32((a), (b)), \ - (__v8si)(__m256i)(w)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_rol_epi32((a), (b)), \ + (__v8si)(__m256i)(w))) #define _mm256_maskz_rol_epi32(u, a, b) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_rol_epi32((a), (b)), \ - (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_rol_epi32((a), (b)), \ + (__v8si)_mm256_setzero_si256())) #define _mm_rol_epi64(a, b) \ - (__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b)) + ((__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b))) #define _mm_mask_rol_epi64(w, u, a, b) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_rol_epi64((a), (b)), \ - (__v2di)(__m128i)(w)) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_rol_epi64((a), (b)), \ + (__v2di)(__m128i)(w))) #define _mm_maskz_rol_epi64(u, a, b) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_rol_epi64((a), (b)), \ - (__v2di)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_rol_epi64((a), (b)), \ + (__v2di)_mm_setzero_si128())) #define _mm256_rol_epi64(a, b) \ - (__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b)) + ((__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b))) #define _mm256_mask_rol_epi64(w, u, a, b) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_rol_epi64((a), (b)), \ - (__v4di)(__m256i)(w)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_rol_epi64((a), (b)), \ + (__v4di)(__m256i)(w))) #define _mm256_maskz_rol_epi64(u, a, b) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_rol_epi64((a), (b)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_rol_epi64((a), (b)), \ + (__v4di)_mm256_setzero_si256())) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rolv_epi32 (__m128i __A, __m128i __B) @@ -4438,56 +4438,56 @@ _mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) } #define _mm_ror_epi32(a, b) \ - (__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b)) + ((__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b))) #define _mm_mask_ror_epi32(w, u, a, b) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_ror_epi32((a), (b)), \ - (__v4si)(__m128i)(w)) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_ror_epi32((a), (b)), \ + (__v4si)(__m128i)(w))) #define _mm_maskz_ror_epi32(u, a, b) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - 
(__v4si)_mm_ror_epi32((a), (b)), \ - (__v4si)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_ror_epi32((a), (b)), \ + (__v4si)_mm_setzero_si128())) #define _mm256_ror_epi32(a, b) \ - (__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b)) + ((__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b))) #define _mm256_mask_ror_epi32(w, u, a, b) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_ror_epi32((a), (b)), \ - (__v8si)(__m256i)(w)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_ror_epi32((a), (b)), \ + (__v8si)(__m256i)(w))) #define _mm256_maskz_ror_epi32(u, a, b) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_ror_epi32((a), (b)), \ - (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_ror_epi32((a), (b)), \ + (__v8si)_mm256_setzero_si256())) #define _mm_ror_epi64(a, b) \ - (__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b)) + ((__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b))) #define _mm_mask_ror_epi64(w, u, a, b) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_ror_epi64((a), (b)), \ - (__v2di)(__m128i)(w)) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_ror_epi64((a), (b)), \ + (__v2di)(__m128i)(w))) #define _mm_maskz_ror_epi64(u, a, b) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_ror_epi64((a), (b)), \ - (__v2di)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_ror_epi64((a), (b)), \ + (__v2di)_mm_setzero_si128())) #define _mm256_ror_epi64(a, b) \ - (__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b)) + ((__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b))) #define _mm256_mask_ror_epi64(w, u, a, b) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_ror_epi64((a), (b)), \ - (__v4di)(__m256i)(w)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_ror_epi64((a), (b)), \ + (__v4di)(__m256i)(w))) #define _mm256_maskz_ror_epi64(u, a, b) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_ror_epi64((a), (b)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_ror_epi64((a), (b)), \ + (__v4di)_mm256_setzero_si256())) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) @@ -5356,76 +5356,76 @@ _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) } #define _mm_fixupimm_pd(A, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_fixupimm_pd(A, U, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm_maskz_fixupimm_pd(U, A, B, C, imm) \ - (__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), \ - (int)(imm), (__mmask8)(U)) + 
((__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), \ + (int)(imm), (__mmask8)(U))) #define _mm256_fixupimm_pd(A, B, C, imm) \ - (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_fixupimm_pd(A, U, B, C, imm) \ - (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) \ - (__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (__v4di)(__m256i)(C), \ - (int)(imm), (__mmask8)(U)) + ((__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + (__v4di)(__m256i)(C), \ + (int)(imm), (__mmask8)(U))) #define _mm_fixupimm_ps(A, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_fixupimm_ps(A, U, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \ - (__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \ + ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm256_fixupimm_ps(A, B, C, imm) \ - (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_fixupimm_ps(A, U, B, C, imm) \ - (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \ - (__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \ + ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ (__v8sf)(__m256)(B), \ (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \ + ((__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P) @@ -6033,44 +6033,44 @@ _mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) } #define _mm_mask_permute_pd(W, U, X, C) \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_permute_pd((X), 
(C)), \ - (__v2df)(__m128d)(W)) + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_permute_pd((X), (C)), \ + (__v2df)(__m128d)(W))) #define _mm_maskz_permute_pd(U, X, C) \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_permute_pd((X), (C)), \ - (__v2df)_mm_setzero_pd()) + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_permute_pd((X), (C)), \ + (__v2df)_mm_setzero_pd())) #define _mm256_mask_permute_pd(W, U, X, C) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_permute_pd((X), (C)), \ - (__v4df)(__m256d)(W)) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_permute_pd((X), (C)), \ + (__v4df)(__m256d)(W))) #define _mm256_maskz_permute_pd(U, X, C) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_permute_pd((X), (C)), \ - (__v4df)_mm256_setzero_pd()) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_permute_pd((X), (C)), \ + (__v4df)_mm256_setzero_pd())) #define _mm_mask_permute_ps(W, U, X, C) \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_permute_ps((X), (C)), \ - (__v4sf)(__m128)(W)) + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_permute_ps((X), (C)), \ + (__v4sf)(__m128)(W))) #define _mm_maskz_permute_ps(U, X, C) \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_permute_ps((X), (C)), \ - (__v4sf)_mm_setzero_ps()) + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_permute_ps((X), (C)), \ + (__v4sf)_mm_setzero_ps())) #define _mm256_mask_permute_ps(W, U, X, C) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_permute_ps((X), (C)), \ - (__v8sf)(__m256)(W)) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_permute_ps((X), (C)), \ + (__v8sf)(__m256)(W))) #define _mm256_maskz_permute_ps(U, X, C) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_permute_ps((X), (C)), \ - (__v8sf)_mm256_setzero_ps()) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_permute_ps((X), (C)), \ + (__v8sf)_mm256_setzero_ps())) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) @@ -6526,175 +6526,175 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm) } #define _mm_ternarylogic_epi32(A, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \ + ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ (__v4si)(__m128i)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \ + ((__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm256_ternarylogic_epi32(A, B, C, imm) \ - 
(__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \ + ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ (__v8si)(__m256i)(B), \ (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \ + ((__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm_ternarylogic_epi64(A, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \ - (__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \ + ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ (__v2di)(__m128i)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \ + ((__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm256_ternarylogic_epi64(A, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)-1) + ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) #define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) - -#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \ - (__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \ + ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ (__v4di)(__m256i)(B), \ (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)) + (__mmask8)(U))) + +#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \ + ((__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) #define _mm256_shuffle_f32x4(A, B, imm) \ - (__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(imm)) + ((__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(imm))) #define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ - (__v8sf)(__m256)(W)) + 
((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ + (__v8sf)(__m256)(W))) #define _mm256_maskz_shuffle_f32x4(U, A, B, imm) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ - (__v8sf)_mm256_setzero_ps()) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ + (__v8sf)_mm256_setzero_ps())) #define _mm256_shuffle_f64x2(A, B, imm) \ - (__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(imm)) + ((__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(imm))) #define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ - (__v4df)(__m256d)(W)) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ + (__v4df)(__m256d)(W))) #define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ - (__v4df)_mm256_setzero_pd()) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ + (__v4df)_mm256_setzero_pd())) #define _mm256_shuffle_i32x4(A, B, imm) \ - (__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(imm))) #define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ - (__v8si)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ + (__v8si)(__m256i)(W))) #define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ + (__v8si)_mm256_setzero_si256())) #define _mm256_shuffle_i64x2(A, B, imm) \ - (__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(imm))) #define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ - (__v4di)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ + (__v4di)(__m256i)(W))) #define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256())) #define _mm_mask_shuffle_pd(W, U, A, B, M) \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_shuffle_pd((A), (B), (M)), \ - (__v2df)(__m128d)(W)) + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_shuffle_pd((A), (B), (M)), \ + (__v2df)(__m128d)(W))) #define _mm_maskz_shuffle_pd(U, A, B, M) \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - 
(__v2df)_mm_shuffle_pd((A), (B), (M)), \ - (__v2df)_mm_setzero_pd()) + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_shuffle_pd((A), (B), (M)), \ + (__v2df)_mm_setzero_pd())) #define _mm256_mask_shuffle_pd(W, U, A, B, M) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ - (__v4df)(__m256d)(W)) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ + (__v4df)(__m256d)(W))) #define _mm256_maskz_shuffle_pd(U, A, B, M) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ - (__v4df)_mm256_setzero_pd()) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ + (__v4df)_mm256_setzero_pd())) #define _mm_mask_shuffle_ps(W, U, A, B, M) \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ - (__v4sf)(__m128)(W)) + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ + (__v4sf)(__m128)(W))) #define _mm_maskz_shuffle_ps(U, A, B, M) \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ - (__v4sf)_mm_setzero_ps()) + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ + (__v4sf)_mm_setzero_ps())) #define _mm256_mask_shuffle_ps(W, U, A, B, M) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ - (__v8sf)(__m256)(W)) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ + (__v8sf)(__m256)(W))) #define _mm256_maskz_shuffle_ps(U, A, B, M) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ - (__v8sf)_mm256_setzero_ps()) + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ + (__v8sf)_mm256_setzero_ps())) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_rsqrt14_pd (__m128d __A) @@ -7834,262 +7834,262 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) } #define _mm256_extractf32x4_ps(A, imm) \ - (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1)) #define _mm256_mask_extractf32x4_ps(W, U, A, imm) \ - (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) #define _mm256_maskz_extractf32x4_ps(U, A, imm) \ - (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) #define _mm256_extracti32x4_epi32(A, imm) \ - (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1) + ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)_mm_undefined_si128(), \ + (__mmask8)-1)) #define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \ - 
(__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U))) #define _mm256_maskz_extracti32x4_epi32(U, A, imm) \ - (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm256_insertf32x4(A, B, imm) \ - (__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \ - (__v4sf)(__m128)(B), (int)(imm)) + ((__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \ + (__v4sf)(__m128)(B), (int)(imm))) #define _mm256_mask_insertf32x4(W, U, A, B, imm) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ - (__v8sf)(__m256)(W)) + (__v8sf)(__m256)(W))) #define _mm256_maskz_insertf32x4(U, A, B, imm) \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ - (__v8sf)_mm256_setzero_ps()) + (__v8sf)_mm256_setzero_ps())) #define _mm256_inserti32x4(A, B, imm) \ - (__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \ - (__v4si)(__m128i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \ + (__v4si)(__m128i)(B), (int)(imm))) #define _mm256_mask_inserti32x4(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \ - (__v8si)(__m256i)(W)) + (__v8si)(__m256i)(W))) #define _mm256_maskz_inserti32x4(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256()) + (__v8si)_mm256_setzero_si256())) #define _mm_getmant_pd(A, B, C) \ - (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1) + ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) #define _mm_mask_getmant_pd(W, U, A, B, C) \ - (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) #define _mm_maskz_getmant_pd(U, A, B, C) \ - (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)) + ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) #define _mm256_getmant_pd(A, B, C) \ - (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1) + ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) #define _mm256_mask_getmant_pd(W, U, A, B, C) \ - (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ - 
(int)(((C)<<2) | (B)), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) #define _mm256_maskz_getmant_pd(U, A, B, C) \ - (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)) + ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) #define _mm_getmant_ps(A, B, C) \ - (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1) + ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) #define _mm_mask_getmant_ps(W, U, A, B, C) \ - (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) #define _mm_maskz_getmant_ps(U, A, B, C) \ - (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)) + ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) #define _mm256_getmant_ps(A, B, C) \ - (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1) + ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) #define _mm256_mask_getmant_ps(W, U, A, B, C) \ - (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) #define _mm256_maskz_getmant_ps(U, A, B, C) \ - (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)) + ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) #define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ - (__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \ - (void const *)(addr), \ - (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \ + (void const *)(addr), \ + (__v2di)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ - (__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \ - (void const *)(addr), \ - (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v2di)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ - (__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \ - (void const *)(addr), \ - (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \ 
+ (void const *)(addr), \ + (__v4di)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ - (__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \ - (void const *)(addr), \ - (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v4di)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ - (__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \ - (void const *)(addr), \ - (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) - -#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ - (__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \ + ((__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \ (void const *)(addr), \ (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + (__mmask8)(mask), (int)(scale))) -#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ - (__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \ - (void const *)(addr), \ - (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) +#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v2di)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) -#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ - (__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \ +#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ + ((__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \ (void const *)(addr), \ (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v4di)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ - (__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ - (__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ - (__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + ((__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ - (__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + 
((__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ - (__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) - -#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ - (__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \ + ((__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \ (void const *)(addr), \ (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)) + (__mmask8)(mask), (int)(scale))) -#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ - (__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) +#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) -#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ - (__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \ +#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ + ((__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \ (void const *)(addr), \ (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)) + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) #define _mm256_permutex_pd(X, C) \ - (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C)) + ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C))) #define _mm256_mask_permutex_pd(W, U, X, C) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_permutex_pd((X), (C)), \ - (__v4df)(__m256d)(W)) + (__v4df)(__m256d)(W))) #define _mm256_maskz_permutex_pd(U, X, C) \ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_permutex_pd((X), (C)), \ - (__v4df)_mm256_setzero_pd()) + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_permutex_pd((X), (C)), \ + (__v4df)_mm256_setzero_pd())) #define _mm256_permutex_epi64(X, C) \ - (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C)) + ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C))) #define _mm256_mask_permutex_epi64(W, U, X, C) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_permutex_epi64((X), (C)), \ - (__v4di)(__m256i)(W)) + (__v4di)(__m256i)(W))) #define _mm256_maskz_permutex_epi64(U, X, C) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_permutex_epi64((X), (C)), \ - (__v4di)_mm256_setzero_si256()) + (__v4di)_mm256_setzero_si256())) static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_permutexvar_pd (__m256i __X, __m256d __Y) @@ -8175,60 +8175,60 @@ _mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) } #define _mm_alignr_epi32(A, B, imm) \ - (__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), 
(int)(imm)) + ((__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(imm))) #define _mm_mask_alignr_epi32(W, U, A, B, imm) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ - (__v4si)(__m128i)(W)) + (__v4si)(__m128i)(W))) #define _mm_maskz_alignr_epi32(U, A, B, imm) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ - (__v4si)_mm_setzero_si128()) + (__v4si)_mm_setzero_si128())) #define _mm256_alignr_epi32(A, B, imm) \ - (__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(imm))) #define _mm256_mask_alignr_epi32(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ - (__v8si)(__m256i)(W)) + (__v8si)(__m256i)(W))) #define _mm256_maskz_alignr_epi32(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256()) + (__v8si)_mm256_setzero_si256())) #define _mm_alignr_epi64(A, B, imm) \ - (__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(imm)) + ((__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(imm))) #define _mm_mask_alignr_epi64(W, U, A, B, imm) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ - (__v2di)(__m128i)(W)) + (__v2di)(__m128i)(W))) #define _mm_maskz_alignr_epi64(U, A, B, imm) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ - (__v2di)_mm_setzero_si128()) + (__v2di)_mm_setzero_si128())) #define _mm256_alignr_epi64(A, B, imm) \ - (__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(imm)) + ((__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(imm))) #define _mm256_mask_alignr_epi64(W, U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ - (__v4di)(__m256i)(W)) + (__v4di)(__m256i)(W))) #define _mm256_maskz_alignr_epi64(U, A, B, imm) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256()) + (__v4di)_mm256_setzero_si256())) static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) @@ -8295,24 +8295,24 @@ _mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) } #define _mm256_mask_shuffle_epi32(W, U, A, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_epi32((A), (I)), \ - (__v8si)(__m256i)(W)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_epi32((A), (I)), \ + (__v8si)(__m256i)(W))) #define _mm256_maskz_shuffle_epi32(U, A, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_epi32((A), (I)), \ 
- (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_epi32((A), (I)), \ + (__v8si)_mm256_setzero_si256())) #define _mm_mask_shuffle_epi32(W, U, A, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shuffle_epi32((A), (I)), \ - (__v4si)(__m128i)(W)) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shuffle_epi32((A), (I)), \ + (__v4si)(__m128i)(W))) #define _mm_maskz_shuffle_epi32(U, A, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shuffle_epi32((A), (I)), \ - (__v4si)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shuffle_epi32((A), (I)), \ + (__v4si)_mm_setzero_si128())) static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) @@ -8413,27 +8413,27 @@ _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) } #define _mm_mask_cvt_roundps_ph(W, U, A, I) \ - (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ + (__v8hi)(__m128i)(W), \ + (__mmask8)(U))) #define _mm_maskz_cvt_roundps_ph(U, A, I) \ - (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ + (__v8hi)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm_mask_cvtps_ph _mm_mask_cvt_roundps_ph #define _mm_maskz_cvtps_ph _mm_maskz_cvt_roundps_ph #define _mm256_mask_cvt_roundps_ph(W, U, A, I) \ - (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ + (__v8hi)(__m128i)(W), \ + (__mmask8)(U))) #define _mm256_maskz_cvt_roundps_ph(U, A, I) \ - (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U)) + ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ + (__v8hi)_mm_setzero_si128(), \ + (__mmask8)(U))) #define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph #define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph diff --git a/clang/lib/Headers/avx512vlvbmi2intrin.h b/clang/lib/Headers/avx512vlvbmi2intrin.h index a40f926de75ab..fac1f232415af 100644 --- a/clang/lib/Headers/avx512vlvbmi2intrin.h +++ b/clang/lib/Headers/avx512vlvbmi2intrin.h @@ -239,172 +239,172 @@ _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) } #define _mm256_shldi_epi64(A, B, I) \ - (__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(I))) #define _mm256_mask_shldi_epi64(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ - (__v4di)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ + (__v4di)(__m256i)(S))) #define _mm256_maskz_shldi_epi64(U, A, B, I) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ + (__v4di)_mm256_setzero_si256())) #define _mm_shldi_epi64(A, B, I) \ - 
(__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(I))) #define _mm_mask_shldi_epi64(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shldi_epi64((A), (B), (I)), \ - (__v2di)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shldi_epi64((A), (B), (I)), \ + (__v2di)(__m128i)(S))) #define _mm_maskz_shldi_epi64(U, A, B, I) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shldi_epi64((A), (B), (I)), \ - (__v2di)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shldi_epi64((A), (B), (I)), \ + (__v2di)_mm_setzero_si128())) #define _mm256_shldi_epi32(A, B, I) \ - (__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(I))) #define _mm256_mask_shldi_epi32(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ - (__v8si)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ + (__v8si)(__m256i)(S))) #define _mm256_maskz_shldi_epi32(U, A, B, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ - (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ + (__v8si)_mm256_setzero_si256())) #define _mm_shldi_epi32(A, B, I) \ - (__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(I))) #define _mm_mask_shldi_epi32(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shldi_epi32((A), (B), (I)), \ - (__v4si)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shldi_epi32((A), (B), (I)), \ + (__v4si)(__m128i)(S))) #define _mm_maskz_shldi_epi32(U, A, B, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shldi_epi32((A), (B), (I)), \ - (__v4si)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shldi_epi32((A), (B), (I)), \ + (__v4si)_mm_setzero_si128())) #define _mm256_shldi_epi16(A, B, I) \ - (__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \ - (__v16hi)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \ + (__v16hi)(__m256i)(B), (int)(I))) #define _mm256_mask_shldi_epi16(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ - (__v16hi)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ + (__v16hi)(__m256i)(S))) #define _mm256_maskz_shldi_epi16(U, A, B, I) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ - (__v16hi)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ + (__v16hi)_mm256_setzero_si256())) #define _mm_shldi_epi16(A, B, I) \ - (__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \ + 
(__v8hi)(__m128i)(B), (int)(I))) #define _mm_mask_shldi_epi16(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ - (__v8hi)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ + (__v8hi)(__m128i)(S))) #define _mm_maskz_shldi_epi16(U, A, B, I) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ - (__v8hi)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ + (__v8hi)_mm_setzero_si128())) #define _mm256_shrdi_epi64(A, B, I) \ - (__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(I))) #define _mm256_mask_shrdi_epi64(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ - (__v4di)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ + (__v4di)(__m256i)(S))) #define _mm256_maskz_shrdi_epi64(U, A, B, I) \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ - (__v4di)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ + (__v4di)_mm256_setzero_si256())) #define _mm_shrdi_epi64(A, B, I) \ - (__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(I))) #define _mm_mask_shrdi_epi64(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ - (__v2di)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ + (__v2di)(__m128i)(S))) #define _mm_maskz_shrdi_epi64(U, A, B, I) \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ - (__v2di)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ + (__v2di)_mm_setzero_si128())) #define _mm256_shrdi_epi32(A, B, I) \ - (__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(I))) #define _mm256_mask_shrdi_epi32(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ - (__v8si)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ + (__v8si)(__m256i)(S))) #define _mm256_maskz_shrdi_epi32(U, A, B, I) \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ - (__v8si)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ + (__v8si)_mm256_setzero_si256())) #define _mm_shrdi_epi32(A, B, I) \ - (__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(I))) #define _mm_mask_shrdi_epi32(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ - 
(__v4si)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ + (__v4si)(__m128i)(S))) #define _mm_maskz_shrdi_epi32(U, A, B, I) \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ - (__v4si)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ + (__v4si)_mm_setzero_si128())) #define _mm256_shrdi_epi16(A, B, I) \ - (__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \ - (__v16hi)(__m256i)(B), (int)(I)) + ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \ + (__v16hi)(__m256i)(B), (int)(I))) #define _mm256_mask_shrdi_epi16(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ - (__v16hi)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ + (__v16hi)(__m256i)(S))) #define _mm256_maskz_shrdi_epi16(U, A, B, I) \ - (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ - (__v16hi)_mm256_setzero_si256()) + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ + (__v16hi)_mm256_setzero_si256())) #define _mm_shrdi_epi16(A, B, I) \ - (__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (int)(I)) + ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (int)(I))) #define _mm_mask_shrdi_epi16(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ - (__v8hi)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ + (__v8hi)(__m128i)(S))) #define _mm_maskz_shrdi_epi16(U, A, B, I) \ - (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ - (__v8hi)_mm_setzero_si128()) + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ + (__v8hi)_mm_setzero_si128())) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h index 71ac1b4370d4f..0fb29af262f71 100644 --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -36,7 +36,7 @@ /// DST[MAX:256] := 0 /// \endoperation #define _mm256_dpbusd_epi32(S, A, B) \ - (__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -56,7 +56,7 @@ /// DST[MAX:256] := 0 /// \endoperation #define _mm256_dpbusds_epi32(S, A, B) \ - (__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -74,7 +74,7 @@ /// DST[MAX:256] := 0 /// \endoperation #define _mm256_dpwssd_epi32(S, A, B) \ - (__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), 
(__v8si)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -92,7 +92,7 @@ /// DST[MAX:256] := 0 /// \endoperation #define _mm256_dpwssds_epi32(S, A, B) \ - (__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)) + ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -112,7 +112,7 @@ /// DST[MAX:128] := 0 /// \endoperation #define _mm_dpbusd_epi32(S, A, B) \ - (__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -132,7 +132,7 @@ /// DST[MAX:128] := 0 /// \endoperation #define _mm_dpbusds_epi32(S, A, B) \ - (__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -150,7 +150,7 @@ /// DST[MAX:128] := 0 /// \endoperation #define _mm_dpwssd_epi32(S, A, B) \ - (__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -168,7 +168,7 @@ /// DST[MAX:128] := 0 /// \endoperation #define _mm_dpwssds_epi32(S, A, B) \ - (__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)) + ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) diff --git a/clang/lib/Headers/f16cintrin.h b/clang/lib/Headers/f16cintrin.h index 109b604adae3b..13905e6fb0ec8 100644 --- a/clang/lib/Headers/f16cintrin.h +++ b/clang/lib/Headers/f16cintrin.h @@ -66,8 +66,8 @@ _cvtsh_ss(unsigned short __a) /// 1XX: Use MXCSR.RC for rounding /// \returns The converted 16-bit half-precision float value. #define _cvtss_sh(a, imm) \ - (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \ - (imm)))[0]) + ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \ + (imm)))[0])) /// Converts a 128-bit vector containing 32-bit float values into a /// 128-bit vector containing 16-bit half-precision float values. @@ -93,7 +93,7 @@ _cvtsh_ss(unsigned short __a) /// values. The lower 64 bits are used to store the converted 16-bit /// half-precision floating-point values. #define _mm_cvtps_ph(a, imm) \ - (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)) + ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))) /// Converts a 128-bit vector containing 16-bit half-precision float /// values into a 128-bit vector containing 32-bit float values. @@ -136,7 +136,7 @@ _mm_cvtph_ps(__m128i __a) /// \returns A 128-bit vector containing the converted 16-bit half-precision /// float values. 
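
[Note] The header hunks above and below are one mechanical change repeated across avx512vlvbmi2intrin.h, avx512vlvnniintrin.h, f16cintrin.h, gfniintrin.h, vpclmulqdqintrin.h and xopintrin.h: every function-like intrinsic macro gets an extra outer pair of parentheses, so the expansion behaves as a single parenthesized expression at any call site. A minimal, self-contained sketch of the hazard this closes follows; the stand-in macro and helper are hypothetical, not part of the patch, and the generated code is unchanged either way.

  // parens_demo.cpp -- stand-ins mirroring the macro shapes before/after this patch
  #include <cstdio>

  static int raw(int x) { return x + 1; }    // stand-in for an __builtin_ia32_* call

  #define OLD_STYLE(X)  (long)raw(X)         // old shape: leading cast, no outer parens
  #define NEW_STYLE(X)  ((long)raw(X))       // new shape: expansion fully wrapped

  int main() {
    // "sizeof OLD_STYLE(2)" expands to "sizeof (long)raw(2)": the parser
    // completes sizeof(long) and then trips over the dangling raw(2).
    // The wrapped form keeps the whole expansion as one operand of sizeof:
    std::printf("%zu\n", sizeof NEW_STYLE(2));   // sizeof((long)raw(2)) == sizeof(long)
    return 0;
  }
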
#define _mm256_cvtps_ph(a, imm) \ - (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)) + ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))) /// Converts a 128-bit vector containing 16-bit half-precision float /// values into a 256-bit vector of [8 x float]. diff --git a/clang/lib/Headers/gfniintrin.h b/clang/lib/Headers/gfniintrin.h index 11a321b7c919b..a59238b0b1319 100644 --- a/clang/lib/Headers/gfniintrin.h +++ b/clang/lib/Headers/gfniintrin.h @@ -28,14 +28,14 @@ #define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256))) #define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \ - (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), \ - (char)(I)) + ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I))) #define _mm_gf2p8affine_epi64_epi8(A, B, I) \ - (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), \ - (char)(I)) + ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I))) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_gf2p8mul_epi8(__m128i __A, __m128i __B) @@ -46,14 +46,14 @@ _mm_gf2p8mul_epi8(__m128i __A, __m128i __B) #ifdef __AVXINTRIN_H #define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \ - (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), \ - (char)(I)) + ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I))) #define _mm256_gf2p8affine_epi64_epi8(A, B, I) \ - (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), \ - (char)(I)) + ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I))) static __inline__ __m256i __DEFAULT_FN_ATTRS_Y _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) @@ -65,32 +65,32 @@ _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) #ifdef __AVX512BWINTRIN_H #define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \ - (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), \ - (char)(I)) + ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), \ + (char)(I))) #define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v64qi)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v64qi)(__m512i)(S))) #define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ - U, A, B, I) + _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ + U, A, B, I) #define _mm512_gf2p8affine_epi64_epi8(A, B, I) \ - (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), \ - (char)(I)) + ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), \ + (char)(I))) #define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \ - (__v64qi)(__m512i)(S)) + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + 
(__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \ + (__v64qi)(__m512i)(S))) #define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ - U, A, B, I) + _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ + U, A, B, I) static __inline__ __m512i __DEFAULT_FN_ATTRS_Z _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) @@ -117,40 +117,39 @@ _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) #ifdef __AVX512VLBWINTRIN_H #define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S))) #define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I) + _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I) #define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S))) #define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I) + _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I) #define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S)) + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S))) #define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I) + _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I) #define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S)) + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S))) #define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I) + _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I) static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 _mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) diff --git a/clang/lib/Headers/intrin.h b/clang/lib/Headers/intrin.h index ff8eb8fca2687..34ec79d6acbc6 100644 --- a/clang/lib/Headers/intrin.h +++ b/clang/lib/Headers/intrin.h @@ -574,6 +574,9 @@ void _WriteStatusReg(int, __int64); unsigned short __cdecl _byteswap_ushort(unsigned short val); unsigned long __cdecl _byteswap_ulong (unsigned long val); unsigned __int64 __cdecl _byteswap_uint64(unsigned __int64 val); + +__int64 __mulh(__int64 __a, __int64 __b); +unsigned __int64 __umulh(unsigned __int64 __a, unsigned __int64 __b); #endif /*----------------------------------------------------------------------------*\ diff --git 
a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h index 3ee58c9d79370..8913a196144bb 100644 --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -865,15 +865,13 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 10: Bits [95:64] of parameter \a X are returned. \n /// 11: Bits [127:96] of parameter \a X are returned. /// \returns A 32-bit integer containing the extracted 32 bits of float data. -#define _mm_extract_ps(X, N) (__extension__ \ - ({ union { int __i; float __f; } __t; \ - __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \ - __t.__i;})) +#define _mm_extract_ps(X, N) \ + __builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N))) /* Miscellaneous insert and extract macros. */ /* Extract a single-precision float from X at index N into D. */ #define _MM_EXTRACT_FLOAT(D, X, N) \ - { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } + do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0) /* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create an index suitable for _mm_insert_ps. */ diff --git a/clang/lib/Headers/vpclmulqdqintrin.h b/clang/lib/Headers/vpclmulqdqintrin.h index 44daadb07d57c..485692ea2b5b1 100644 --- a/clang/lib/Headers/vpclmulqdqintrin.h +++ b/clang/lib/Headers/vpclmulqdqintrin.h @@ -15,15 +15,15 @@ #define __VPCLMULQDQINTRIN_H #define _mm256_clmulepi64_epi128(A, B, I) \ - (__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (char)(I)) + ((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (char)(I))) #ifdef __AVX512FINTRIN_H #define _mm512_clmulepi64_epi128(A, B, I) \ - (__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - (char)(I)) + ((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (char)(I))) #endif // __AVX512FINTRIN_H #endif /* __VPCLMULQDQINTRIN_H */ diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h index 712fa03780986..3889a2769faf7 100644 --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -276,12 +276,28 @@ wasm_i8x16_make(int8_t __c0, int8_t __c1, int8_t __c2, int8_t __c3, int8_t __c4, __c12, __c13, __c14, __c15}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS +wasm_u8x16_make(uint8_t __c0, uint8_t __c1, uint8_t __c2, uint8_t __c3, + uint8_t __c4, uint8_t __c5, uint8_t __c6, uint8_t __c7, + uint8_t __c8, uint8_t __c9, uint8_t __c10, uint8_t __c11, + uint8_t __c12, uint8_t __c13, uint8_t __c14, uint8_t __c15) { + return (v128_t)(__u8x16){__c0, __c1, __c2, __c3, __c4, __c5, + __c6, __c7, __c8, __c9, __c10, __c11, + __c12, __c13, __c14, __c15}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_make(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3, int16_t __c4, int16_t __c5, int16_t __c6, int16_t __c7) { return (v128_t)(__i16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS +wasm_u16x8_make(uint16_t __c0, uint16_t __c1, uint16_t __c2, uint16_t __c3, + uint16_t __c4, uint16_t __c5, uint16_t __c6, uint16_t __c7) { + return (v128_t)(__u16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_make(int32_t __c0, int32_t __c1, int32_t __c2, @@ -289,11 +305,23 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_make(int32_t __c0, return (v128_t)(__i32x4){__c0, __c1, __c2, __c3}; } +static 
__inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_make(uint32_t __c0, + uint32_t __c1, + uint32_t __c2, + uint32_t __c3) { + return (v128_t)(__u32x4){__c0, __c1, __c2, __c3}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_make(int64_t __c0, int64_t __c1) { return (v128_t)(__i64x2){__c0, __c1}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_make(uint64_t __c0, + uint64_t __c1) { + return (v128_t)(__u64x2){__c0, __c1}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_make(float __c0, float __c1, float __c2, @@ -324,6 +352,24 @@ wasm_i8x16_const(int8_t __c0, int8_t __c1, int8_t __c2, int8_t __c3, __c12, __c13, __c14, __c15}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS +wasm_u8x16_const(uint8_t __c0, uint8_t __c1, uint8_t __c2, uint8_t __c3, + uint8_t __c4, uint8_t __c5, uint8_t __c6, uint8_t __c7, + uint8_t __c8, uint8_t __c9, uint8_t __c10, uint8_t __c11, + uint8_t __c12, uint8_t __c13, uint8_t __c14, uint8_t __c15) + __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2) + __REQUIRE_CONSTANT(__c3) __REQUIRE_CONSTANT(__c4) + __REQUIRE_CONSTANT(__c5) __REQUIRE_CONSTANT(__c6) + __REQUIRE_CONSTANT(__c7) __REQUIRE_CONSTANT(__c8) + __REQUIRE_CONSTANT(__c9) __REQUIRE_CONSTANT(__c10) + __REQUIRE_CONSTANT(__c11) __REQUIRE_CONSTANT(__c12) + __REQUIRE_CONSTANT(__c13) __REQUIRE_CONSTANT(__c14) + __REQUIRE_CONSTANT(__c15) { + return (v128_t)(__u8x16){__c0, __c1, __c2, __c3, __c4, __c5, + __c6, __c7, __c8, __c9, __c10, __c11, + __c12, __c13, __c14, __c15}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_const(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3, int16_t __c4, int16_t __c5, int16_t __c6, int16_t __c7) @@ -334,6 +380,16 @@ wasm_i16x8_const(int16_t __c0, int16_t __c1, int16_t __c2, int16_t __c3, return (v128_t)(__i16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS +wasm_u16x8_const(uint16_t __c0, uint16_t __c1, uint16_t __c2, uint16_t __c3, + uint16_t __c4, uint16_t __c5, uint16_t __c6, uint16_t __c7) + __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2) + __REQUIRE_CONSTANT(__c3) __REQUIRE_CONSTANT(__c4) + __REQUIRE_CONSTANT(__c5) __REQUIRE_CONSTANT(__c6) + __REQUIRE_CONSTANT(__c7) { + return (v128_t)(__u16x8){__c0, __c1, __c2, __c3, __c4, __c5, __c6, __c7}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_const(int32_t __c0, int32_t __c1, int32_t __c2, int32_t __c3) __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2) @@ -341,12 +397,25 @@ wasm_i32x4_const(int32_t __c0, int32_t __c1, int32_t __c2, int32_t __c3) return (v128_t)(__i32x4){__c0, __c1, __c2, __c3}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS +wasm_u32x4_const(uint32_t __c0, uint32_t __c1, uint32_t __c2, uint32_t __c3) + __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2) + __REQUIRE_CONSTANT(__c3) { + return (v128_t)(__u32x4){__c0, __c1, __c2, __c3}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_const(int64_t __c0, int64_t __c1) __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) { return (v128_t)(__i64x2){__c0, __c1}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_const(uint64_t __c0, + uint64_t __c1) + __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) { + return (v128_t)(__u64x2){__c0, __c1}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_const(float __c0, float __c1, float __c2, float __c3) __REQUIRE_CONSTANT(__c0) __REQUIRE_CONSTANT(__c1) __REQUIRE_CONSTANT(__c2) @@ -366,21 +435,42 @@ 
static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_const_splat(int8_t __c) __c, __c, __c, __c, __c, __c, __c, __c}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_const_splat(uint8_t __c) + __REQUIRE_CONSTANT(__c) { + return (v128_t)(__u8x16){__c, __c, __c, __c, __c, __c, __c, __c, + __c, __c, __c, __c, __c, __c, __c, __c}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_const_splat(int16_t __c) __REQUIRE_CONSTANT(__c) { return (v128_t)(__i16x8){__c, __c, __c, __c, __c, __c, __c, __c}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_const_splat(uint16_t __c) + __REQUIRE_CONSTANT(__c) { + return (v128_t)(__u16x8){__c, __c, __c, __c, __c, __c, __c, __c}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_const_splat(int32_t __c) __REQUIRE_CONSTANT(__c) { return (v128_t)(__i32x4){__c, __c, __c, __c}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_const_splat(uint32_t __c) + __REQUIRE_CONSTANT(__c) { + return (v128_t)(__u32x4){__c, __c, __c, __c}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_const_splat(int64_t __c) __REQUIRE_CONSTANT(__c) { return (v128_t)(__i64x2){__c, __c}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_const_splat(uint64_t __c) + __REQUIRE_CONSTANT(__c) { + return (v128_t)(__u64x2){__c, __c}; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_const_splat(float __c) __REQUIRE_CONSTANT(__c) { return (v128_t)(__f32x4){__c, __c, __c, __c}; @@ -396,6 +486,11 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_splat(int8_t __a) { __a, __a, __a, __a, __a, __a, __a, __a}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_splat(uint8_t __a) { + return (v128_t)(__u8x16){__a, __a, __a, __a, __a, __a, __a, __a, + __a, __a, __a, __a, __a, __a, __a, __a}; +} + static __inline__ int8_t __DEFAULT_FN_ATTRS wasm_i8x16_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) { @@ -417,10 +512,23 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_replace_lane(v128_t __a, return (v128_t)__v; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_replace_lane(v128_t __a, + int __i, + uint8_t __b) + __REQUIRE_CONSTANT(__i) { + __u8x16 __v = (__u8x16)__a; + __v[__i] = __b; + return (v128_t)__v; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_splat(int16_t __a) { return (v128_t)(__i16x8){__a, __a, __a, __a, __a, __a, __a, __a}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_splat(uint16_t __a) { + return (v128_t)(__u16x8){__a, __a, __a, __a, __a, __a, __a, __a}; +} + static __inline__ int16_t __DEFAULT_FN_ATTRS wasm_i16x8_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) { @@ -441,16 +549,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_replace_lane(v128_t __a, return (v128_t)__v; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_replace_lane( + v128_t __a, int __i, uint16_t __b) __REQUIRE_CONSTANT(__i) { + __u16x8 __v = (__u16x8)__a; + __v[__i] = __b; + return (v128_t)__v; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_splat(int32_t __a) { return (v128_t)(__i32x4){__a, __a, __a, __a}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_splat(uint32_t __a) { + return (v128_t)(__u32x4){__a, __a, __a, __a}; +} + static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i32x4_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) { return ((__i32x4)__a)[__i]; } +static __inline__ uint32_t __DEFAULT_FN_ATTRS +wasm_u32x4_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) { + return ((__u32x4)__a)[__i]; +} + static 
__inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_replace_lane(v128_t __a, int __i, int32_t __b) @@ -460,16 +584,32 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_replace_lane(v128_t __a, return (v128_t)__v; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_replace_lane( + v128_t __a, int __i, uint32_t __b) __REQUIRE_CONSTANT(__i) { + __u32x4 __v = (__u32x4)__a; + __v[__i] = __b; + return (v128_t)__v; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_splat(int64_t __a) { return (v128_t)(__i64x2){__a, __a}; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_splat(uint64_t __a) { + return (v128_t)(__u64x2){__a, __a}; +} + static __inline__ int64_t __DEFAULT_FN_ATTRS wasm_i64x2_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) { return ((__i64x2)__a)[__i]; } +static __inline__ uint64_t __DEFAULT_FN_ATTRS +wasm_u64x2_extract_lane(v128_t __a, int __i) __REQUIRE_CONSTANT(__i) { + return ((__u64x2)__a)[__i]; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_replace_lane(v128_t __a, int __i, int64_t __b) @@ -479,6 +619,13 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_replace_lane(v128_t __a, return (v128_t)__v; } +static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_replace_lane( + v128_t __a, int __i, uint64_t __b) __REQUIRE_CONSTANT(__i) { + __u64x2 __v = (__u64x2)__a; + __v[__i] = __b; + return (v128_t)__v; +} + static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_splat(float __a) { return (v128_t)(__f32x4){__a, __a, __a, __a}; } @@ -804,7 +951,7 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i8x16_all_true(v128_t __a) { return __builtin_wasm_all_true_i8x16((__i8x16)__a); } -static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) { +static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i8x16_bitmask(v128_t __a) { return __builtin_wasm_bitmask_i8x16((__i8x16)__a); } @@ -813,17 +960,17 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_popcnt(v128_t __a) { } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__i8x16)__a << __b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shr(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__i8x16)__a >> __b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_shr(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__u8x16)__a >> __b); } @@ -894,22 +1041,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i16x8_all_true(v128_t __a) { return __builtin_wasm_all_true_i16x8((__i16x8)__a); } -static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i16x8_bitmask(v128_t __a) { +static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i16x8_bitmask(v128_t __a) { return __builtin_wasm_bitmask_i16x8((__i16x8)__a); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shl(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__i16x8)__a << __b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shr(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__i16x8)__a >> __b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_shr(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__u16x8)__a >> __b); } @@ -985,22 +1132,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i32x4_all_true(v128_t __a) { return __builtin_wasm_all_true_i32x4((__i32x4)__a); } -static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i32x4_bitmask(v128_t __a) { +static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i32x4_bitmask(v128_t __a) { return 
__builtin_wasm_bitmask_i32x4((__i32x4)__a); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shl(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__i32x4)__a << __b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shr(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__i32x4)__a >> __b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_shr(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__u32x4)__a >> __b); } @@ -1056,22 +1203,22 @@ static __inline__ bool __DEFAULT_FN_ATTRS wasm_i64x2_all_true(v128_t __a) { return __builtin_wasm_all_true_i64x2((__i64x2)__a); } -static __inline__ int32_t __DEFAULT_FN_ATTRS wasm_i64x2_bitmask(v128_t __a) { +static __inline__ uint32_t __DEFAULT_FN_ATTRS wasm_i64x2_bitmask(v128_t __a) { return __builtin_wasm_bitmask_i64x2((__i64x2)__a); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shl(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__i64x2)__a << (int64_t)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shr(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__i64x2)__a >> (int64_t)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_shr(v128_t __a, - int32_t __b) { + uint32_t __b) { return (v128_t)((__u64x2)__a >> (int64_t)__b); } @@ -1150,14 +1297,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_max(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmin(v128_t __a, v128_t __b) { - __i32x4 __mask = (__i32x4)((__f32x4)__b < (__f32x4)__a); - return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmin_f32x4((__f32x4)__a, (__f32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_pmax(v128_t __a, v128_t __b) { - __i32x4 __mask = (__i32x4)((__f32x4)__a < (__f32x4)__b); - return (v128_t)((((__i32x4)__b) & __mask) | (((__i32x4)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmax_f32x4((__f32x4)__a, (__f32x4)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_abs(v128_t __a) { @@ -1220,14 +1365,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_max(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmin(v128_t __a, v128_t __b) { - __i64x2 __mask = (__i64x2)((__f64x2)__b < (__f64x2)__a); - return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmin_f64x2((__f64x2)__a, (__f64x2)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_pmax(v128_t __a, v128_t __b) { - __i64x2 __mask = (__i64x2)((__f64x2)__a < (__f64x2)__b); - return (v128_t)((((__i64x2)__b) & __mask) | (((__i64x2)__a) & ~__mask)); + return (v128_t)__builtin_wasm_pmax_f64x2((__f64x2)__a, (__f64x2)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/xopintrin.h b/clang/lib/Headers/xopintrin.h index 5cedde41b625f..976cdf4902a40 100644 --- a/clang/lib/Headers/xopintrin.h +++ b/clang/lib/Headers/xopintrin.h @@ -225,16 +225,16 @@ _mm_rot_epi64(__m128i __A, __m128i __B) } #define _mm_roti_epi8(A, N) \ - (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)) + ((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))) #define _mm_roti_epi16(A, N) \ - (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)) + ((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))) #define _mm_roti_epi32(A, N) \ - (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)) + ((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))) #define _mm_roti_epi64(A, N) \ - 
(__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)) + ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_shl_epi8(__m128i __A, __m128i __B) @@ -285,36 +285,36 @@ _mm_sha_epi64(__m128i __A, __m128i __B) } #define _mm_com_epu8(A, B, N) \ - (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (N))) #define _mm_com_epu16(A, B, N) \ - (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (N))) #define _mm_com_epu32(A, B, N) \ - (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (N))) #define _mm_com_epu64(A, B, N) \ - (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (N))) #define _mm_com_epi8(A, B, N) \ - (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (N))) #define _mm_com_epi16(A, B, N) \ - (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (N))) #define _mm_com_epi32(A, B, N) \ - (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (N))) #define _mm_com_epi64(A, B, N) \ - (__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (N)) + ((__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (N))) #define _MM_PCOMCTRL_LT 0 #define _MM_PCOMCTRL_LE 1 @@ -710,23 +710,23 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B) } #define _mm_permute2_pd(X, Y, C, I) \ - (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (__v2di)(__m128i)(C), (I)) + ((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128i)(C), (I))) #define _mm256_permute2_pd(X, Y, C, I) \ - (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \ - (__v4df)(__m256d)(Y), \ - (__v4di)(__m256i)(C), (I)) + ((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256i)(C), (I))) #define _mm_permute2_ps(X, Y, C, I) \ - (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ - (__v4si)(__m128i)(C), (I)) + ((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ + (__v4si)(__m128i)(C), (I))) #define _mm256_permute2_ps(X, Y, C, I) \ - (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \ - (__v8sf)(__m256)(Y), \ - (__v8si)(__m256i)(C), (I)) + ((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256i)(C), (I))) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_frcz_ss(__m128 __A) diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index 64944492eb99b..05b84e0c14b3e 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -1446,19 +1446,30 @@ void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { IsAtPhysicalStartOfLine = StartOfLine; } +static bool isUnicodeWhitespace(uint32_t 
Codepoint) { + static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( + UnicodeWhitespaceCharRanges); + return UnicodeWhitespaceChars.contains(Codepoint); +} + static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { if (LangOpts.AsmPreprocessor) { return false; } else if (LangOpts.DollarIdents && '$' == C) { return true; - } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { + } else if (LangOpts.CPlusPlus) { + // A non-leading codepoint must have the XID_Continue property. + // XIDContinueRanges doesn't contains characters also in XIDStartRanges, + // so we need to check both tables. + // '_' doesn't have the XID_Continue property but is allowed in C++. + static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); + static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges); + return C == '_' || XIDStartChars.contains(C) || + XIDContinueChars.contains(C); + } else if (LangOpts.C11) { static const llvm::sys::UnicodeCharSet C11AllowedIDChars( C11AllowedIDCharRanges); return C11AllowedIDChars.contains(C); - } else if (LangOpts.CPlusPlus) { - static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( - CXX03AllowedIDCharRanges); - return CXX03AllowedIDChars.contains(C); } else { static const llvm::sys::UnicodeCharSet C99AllowedIDChars( C99AllowedIDCharRanges); @@ -1467,20 +1478,24 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { } static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { - assert(isAllowedIDChar(C, LangOpts)); if (LangOpts.AsmPreprocessor) { return false; - } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { + } + if (LangOpts.CPlusPlus) { + static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); + // '_' doesn't have the XID_Start property but is allowed in C++. + return C == '_' || XIDStartChars.contains(C); + } + if (!isAllowedIDChar(C, LangOpts)) + return false; + if (LangOpts.C11) { static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( C11DisallowedInitialIDCharRanges); return !C11DisallowedInitialIDChars.contains(C); - } else if (LangOpts.CPlusPlus) { - return true; - } else { - static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( - C99DisallowedInitialIDCharRanges); - return !C99DisallowedInitialIDChars.contains(C); } + static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( + C99DisallowedInitialIDCharRanges); + return !C99DisallowedInitialIDChars.contains(C); } static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, @@ -1512,16 +1527,6 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, << CannotStartIdentifier; } } - - // Check C++98 compatibility. 
- if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) { - static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( - CXX03AllowedIDCharRanges); - if (!CXX03AllowedIDChars.contains(C)) { - Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id) - << Range; - } - } } /// After encountering UTF-8 character C and interpreting it as an identifier @@ -1608,14 +1613,56 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, } } +static void diagnoseInvalidUnicodeCodepointInIdentifier( + DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, + CharSourceRange Range, bool IsFirst) { + if (isASCII(CodePoint)) + return; + + bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts); + bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts); + + if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue)) + return; + + bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue; + + llvm::SmallString<5> CharBuf; + llvm::raw_svector_ostream CharOS(CharBuf); + llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4); + + if (!IsFirst || InvalidOnlyAtStart) { + Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier) + << Range << CharBuf << int(InvalidOnlyAtStart) + << FixItHint::CreateRemoval(Range); + } else { + Diags.Report(Range.getBegin(), diag::err_character_not_allowed) + << Range << CharBuf << FixItHint::CreateRemoval(Range); + } +} + bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, Token &Result) { const char *UCNPtr = CurPtr + Size; uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr); - if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts)) + if (CodePoint == 0) { return false; + } - if (!isLexingRawMode()) + if (!isAllowedIDChar(CodePoint, LangOpts)) { + if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) + return false; + if (!isLexingRawMode() && !ParsingPreprocessorDirective && + !PP->isPreprocessedOutput()) + diagnoseInvalidUnicodeCodepointInIdentifier( + PP->getDiagnostics(), LangOpts, CodePoint, + makeCharRange(*this, CurPtr, UCNPtr), + /*IsFirst=*/false); + + // We got a unicode codepoint that is neither a space nor a + // a valid identifier part. + // Carry on as if the codepoint was valid for recovery purposes. + } else if (!isLexingRawMode()) maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, makeCharRange(*this, CurPtr, UCNPtr), /*IsFirst=*/false); @@ -1638,11 +1685,22 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { (const llvm::UTF8 *)BufferEnd, &CodePoint, llvm::strictConversion); - if (Result != llvm::conversionOK || - !isAllowedIDChar(static_cast(CodePoint), LangOpts)) + if (Result != llvm::conversionOK) return false; - if (!isLexingRawMode()) { + if (!isAllowedIDChar(static_cast(CodePoint), LangOpts)) { + if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) + return false; + + if (!isLexingRawMode() && !ParsingPreprocessorDirective && + !PP->isPreprocessedOutput()) + diagnoseInvalidUnicodeCodepointInIdentifier( + PP->getDiagnostics(), LangOpts, CodePoint, + makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false); + // We got a unicode codepoint that is neither a space nor a + // a valid identifier part. Carry on as if the codepoint was + // valid for recovery purposes. 
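
[Note] The Lexer.cpp hunks in this stretch switch C++ identifier validation from the old C++03/C11 tables to Unicode XID_Start/XID_Continue (with '_' special-cased), drop the C++98-compat path, and replace the silent bail-out on a disallowed codepoint with a dedicated diagnostic plus a removal fix-it while still consuming the codepoint for recovery. A rough sketch of the user-visible effect, as hypothetical test input rather than anything from the patch:

  // xid_identifiers.cpp -- sketch of what the XID-based rules accept in C++ mode
  int déjà_vu = 0;      // OK: U+00E9 and U+00E0 have XID_Start/XID_Continue
  int _privé  = 1;      // OK: leading '_' is special-cased, the rest is XID_Continue
  // int 🙂count = 2;   // rejected: U+1F642 is neither XID_Start nor XID_Continue;
                        // the lexer now reports err_character_not_allowed_identifier /
                        // err_character_not_allowed with a fix-it removing the
                        // character, then carries on lexing for recovery.
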
+ } else if (!isLexingRawMode()) { maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false); @@ -3136,10 +3194,8 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr) { - static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( - UnicodeWhitespaceCharRanges); if (!isLexingRawMode() && !PP->isPreprocessedOutput() && - UnicodeWhitespaceChars.contains(C)) { + isUnicodeWhitespace(C)) { Diag(BufferPtr, diag::ext_unicode_whitespace) << makeCharRange(*this, BufferPtr, CurPtr); @@ -3150,7 +3206,7 @@ bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, } bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { - if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) { + if (isAllowedInitiallyIDChar(C, LangOpts)) { if (!isLexingRawMode() && !ParsingPreprocessorDirective && !PP->isPreprocessedOutput()) { maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, @@ -3165,8 +3221,8 @@ bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { } if (!isLexingRawMode() && !ParsingPreprocessorDirective && - !PP->isPreprocessedOutput() && - !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) { + !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && + !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) { // Non-ASCII characters tend to creep into source code unintentionally. // Instead of letting the parser complain about the unknown token, // just drop the character. @@ -3176,9 +3232,9 @@ bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { // loophole in the mapping of Unicode characters to basic character set // characters that allows us to map these particular characters to, say, // whitespace. - Diag(BufferPtr, diag::err_non_ascii) - << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr)); - + diagnoseInvalidUnicodeCodepointInIdentifier( + PP->getDiagnostics(), LangOpts, C, + makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); BufferPtr = CurPtr; return false; } diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 85d826ce9c6f7..f012fb72580ed 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -1390,14 +1390,14 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front(); if (NumCharsSoFar > 1) { - if (isWide()) - PP.Diag(Loc, diag::warn_extraneous_char_constant); - else if (isAscii() && NumCharsSoFar == 4) + if (isAscii() && NumCharsSoFar == 4) PP.Diag(Loc, diag::warn_four_char_character_literal); else if (isAscii()) PP.Diag(Loc, diag::warn_multichar_character_literal); - else - PP.Diag(Loc, diag::err_multichar_utf_character_literal); + else { + PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 
0 : 1); + HadError = true; + } IsMultiChar = true; } else { IsMultiChar = false; diff --git a/clang/lib/Lex/UnicodeCharSets.h b/clang/lib/Lex/UnicodeCharSets.h index 74dd57fdf1184..33fb4d39ebec2 100644 --- a/clang/lib/Lex/UnicodeCharSets.h +++ b/clang/lib/Lex/UnicodeCharSets.h @@ -10,6 +10,344 @@ #include "llvm/Support/UnicodeCharRanges.h" +// Unicode 13 XID_Start +static const llvm::sys::UnicodeCharRange XIDStartRanges[] = { + {0x0041, 0x005A}, {0x0061, 0x007A}, {0x00AA, 0x00AA}, + {0x00B5, 0x00B5}, {0x00BA, 0x00BA}, {0x00C0, 0x00D6}, + {0x00D8, 0x00F6}, {0x00F8, 0x02C1}, {0x02C6, 0x02D1}, + {0x02E0, 0x02E4}, {0x02EC, 0x02EC}, {0x02EE, 0x02EE}, + {0x0370, 0x0374}, {0x0376, 0x0377}, {0x037B, 0x037D}, + {0x037F, 0x037F}, {0x0386, 0x0386}, {0x0388, 0x038A}, + {0x038C, 0x038C}, {0x038E, 0x03A1}, {0x03A3, 0x03F5}, + {0x03F7, 0x0481}, {0x048A, 0x052F}, {0x0531, 0x0556}, + {0x0559, 0x0559}, {0x0560, 0x0588}, {0x05D0, 0x05EA}, + {0x05EF, 0x05F2}, {0x0620, 0x064A}, {0x066E, 0x066F}, + {0x0671, 0x06D3}, {0x06D5, 0x06D5}, {0x06E5, 0x06E6}, + {0x06EE, 0x06EF}, {0x06FA, 0x06FC}, {0x06FF, 0x06FF}, + {0x0710, 0x0710}, {0x0712, 0x072F}, {0x074D, 0x07A5}, + {0x07B1, 0x07B1}, {0x07CA, 0x07EA}, {0x07F4, 0x07F5}, + {0x07FA, 0x07FA}, {0x0800, 0x0815}, {0x081A, 0x081A}, + {0x0824, 0x0824}, {0x0828, 0x0828}, {0x0840, 0x0858}, + {0x0860, 0x086A}, {0x08A0, 0x08B4}, {0x08B6, 0x08C7}, + {0x0904, 0x0939}, {0x093D, 0x093D}, {0x0950, 0x0950}, + {0x0958, 0x0961}, {0x0971, 0x0980}, {0x0985, 0x098C}, + {0x098F, 0x0990}, {0x0993, 0x09A8}, {0x09AA, 0x09B0}, + {0x09B2, 0x09B2}, {0x09B6, 0x09B9}, {0x09BD, 0x09BD}, + {0x09CE, 0x09CE}, {0x09DC, 0x09DD}, {0x09DF, 0x09E1}, + {0x09F0, 0x09F1}, {0x09FC, 0x09FC}, {0x0A05, 0x0A0A}, + {0x0A0F, 0x0A10}, {0x0A13, 0x0A28}, {0x0A2A, 0x0A30}, + {0x0A32, 0x0A33}, {0x0A35, 0x0A36}, {0x0A38, 0x0A39}, + {0x0A59, 0x0A5C}, {0x0A5E, 0x0A5E}, {0x0A72, 0x0A74}, + {0x0A85, 0x0A8D}, {0x0A8F, 0x0A91}, {0x0A93, 0x0AA8}, + {0x0AAA, 0x0AB0}, {0x0AB2, 0x0AB3}, {0x0AB5, 0x0AB9}, + {0x0ABD, 0x0ABD}, {0x0AD0, 0x0AD0}, {0x0AE0, 0x0AE1}, + {0x0AF9, 0x0AF9}, {0x0B05, 0x0B0C}, {0x0B0F, 0x0B10}, + {0x0B13, 0x0B28}, {0x0B2A, 0x0B30}, {0x0B32, 0x0B33}, + {0x0B35, 0x0B39}, {0x0B3D, 0x0B3D}, {0x0B5C, 0x0B5D}, + {0x0B5F, 0x0B61}, {0x0B71, 0x0B71}, {0x0B83, 0x0B83}, + {0x0B85, 0x0B8A}, {0x0B8E, 0x0B90}, {0x0B92, 0x0B95}, + {0x0B99, 0x0B9A}, {0x0B9C, 0x0B9C}, {0x0B9E, 0x0B9F}, + {0x0BA3, 0x0BA4}, {0x0BA8, 0x0BAA}, {0x0BAE, 0x0BB9}, + {0x0BD0, 0x0BD0}, {0x0C05, 0x0C0C}, {0x0C0E, 0x0C10}, + {0x0C12, 0x0C28}, {0x0C2A, 0x0C39}, {0x0C3D, 0x0C3D}, + {0x0C58, 0x0C5A}, {0x0C60, 0x0C61}, {0x0C80, 0x0C80}, + {0x0C85, 0x0C8C}, {0x0C8E, 0x0C90}, {0x0C92, 0x0CA8}, + {0x0CAA, 0x0CB3}, {0x0CB5, 0x0CB9}, {0x0CBD, 0x0CBD}, + {0x0CDE, 0x0CDE}, {0x0CE0, 0x0CE1}, {0x0CF1, 0x0CF2}, + {0x0D04, 0x0D0C}, {0x0D0E, 0x0D10}, {0x0D12, 0x0D3A}, + {0x0D3D, 0x0D3D}, {0x0D4E, 0x0D4E}, {0x0D54, 0x0D56}, + {0x0D5F, 0x0D61}, {0x0D7A, 0x0D7F}, {0x0D85, 0x0D96}, + {0x0D9A, 0x0DB1}, {0x0DB3, 0x0DBB}, {0x0DBD, 0x0DBD}, + {0x0DC0, 0x0DC6}, {0x0E01, 0x0E30}, {0x0E32, 0x0E32}, + {0x0E40, 0x0E46}, {0x0E81, 0x0E82}, {0x0E84, 0x0E84}, + {0x0E86, 0x0E8A}, {0x0E8C, 0x0EA3}, {0x0EA5, 0x0EA5}, + {0x0EA7, 0x0EB0}, {0x0EB2, 0x0EB2}, {0x0EBD, 0x0EBD}, + {0x0EC0, 0x0EC4}, {0x0EC6, 0x0EC6}, {0x0EDC, 0x0EDF}, + {0x0F00, 0x0F00}, {0x0F40, 0x0F47}, {0x0F49, 0x0F6C}, + {0x0F88, 0x0F8C}, {0x1000, 0x102A}, {0x103F, 0x103F}, + {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, + {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, + {0x108E, 0x108E}, {0x10A0, 0x10C5}, 
{0x10C7, 0x10C7}, + {0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, + {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, + {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, + {0x1290, 0x12B0}, {0x12B2, 0x12B5}, {0x12B8, 0x12BE}, + {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, + {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, + {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, + {0x1401, 0x166C}, {0x166F, 0x167F}, {0x1681, 0x169A}, + {0x16A0, 0x16EA}, {0x16EE, 0x16F8}, {0x1700, 0x170C}, + {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, + {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3}, + {0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, + {0x1880, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, + {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, + {0x1980, 0x19AB}, {0x19B0, 0x19C9}, {0x1A00, 0x1A16}, + {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, + {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, + {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F}, + {0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, + {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, + {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, + {0x1E00, 0x1F15}, {0x1F18, 0x1F1D}, {0x1F20, 0x1F45}, + {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, + {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, + {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, + {0x1FC2, 0x1FC4}, {0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, + {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, + {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, + {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107}, + {0x210A, 0x2113}, {0x2115, 0x2115}, {0x2118, 0x211D}, + {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, + {0x212A, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, + {0x214E, 0x214E}, {0x2160, 0x2188}, {0x2C00, 0x2C2E}, + {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, + {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, + {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F}, + {0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, + {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, + {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, + {0x3005, 0x3007}, {0x3021, 0x3029}, {0x3031, 0x3035}, + {0x3038, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, + {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, + {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, + {0x3400, 0x4DBF}, {0x4E00, 0x9FFC}, {0xA000, 0xA48C}, + {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, + {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, + {0xA6A0, 0xA6EF}, {0xA717, 0xA71F}, {0xA722, 0xA788}, + {0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, + {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, + {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, + {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE}, {0xA90A, 0xA925}, + {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, + {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, + {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, + {0xAA44, 0xAA4B}, {0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, + {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, + {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, + {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4}, + {0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, + {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, + {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, + {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB}, {0xF900, 0xFA6D}, + {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, + 
{0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, + {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, + {0xFB43, 0xFB44}, {0xFB46, 0xFBB1}, {0xFBD3, 0xFC5D}, + {0xFC64, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, + {0xFDF0, 0xFDF9}, {0xFE71, 0xFE71}, {0xFE73, 0xFE73}, + {0xFE77, 0xFE77}, {0xFE79, 0xFE79}, {0xFE7B, 0xFE7B}, + {0xFE7D, 0xFE7D}, {0xFE7F, 0xFEFC}, {0xFF21, 0xFF3A}, + {0xFF41, 0xFF5A}, {0xFF66, 0xFF9D}, {0xFFA0, 0xFFBE}, + {0xFFC2, 0xFFC7}, {0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, + {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, + {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, + {0x10050, 0x1005D}, {0x10080, 0x100FA}, {0x10140, 0x10174}, + {0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, + {0x1032D, 0x1034A}, {0x10350, 0x10375}, {0x10380, 0x1039D}, + {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x103D1, 0x103D5}, + {0x10400, 0x1049D}, {0x104B0, 0x104D3}, {0x104D8, 0x104FB}, + {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, + {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, + {0x10808, 0x10808}, {0x1080A, 0x10835}, {0x10837, 0x10838}, + {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, + {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, + {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7}, + {0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, + {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, + {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, + {0x10B00, 0x10B35}, {0x10B40, 0x10B55}, {0x10B60, 0x10B72}, + {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, + {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, + {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C}, {0x10F27, 0x10F27}, + {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, + {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, + {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147}, + {0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, + {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, + {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, + {0x11288, 0x11288}, {0x1128A, 0x1128D}, {0x1128F, 0x1129D}, + {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, + {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, + {0x11332, 0x11333}, {0x11335, 0x11339}, {0x1133D, 0x1133D}, + {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, + {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, + {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE}, + {0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, + {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, + {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, + {0x11909, 0x11909}, {0x1190C, 0x11913}, {0x11915, 0x11916}, + {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, + {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, + {0x119E3, 0x119E3}, {0x11A00, 0x11A00}, {0x11A0B, 0x11A32}, + {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, + {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, + {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F}, + {0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, + {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, + {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, + {0x11FB0, 0x11FB0}, {0x12000, 0x12399}, {0x12400, 0x1246E}, + {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, + {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, + {0x16B00, 
0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77}, + {0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, + {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, + {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, + {0x18D00, 0x18D08}, {0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, + {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, + {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, + {0x1D400, 0x1D454}, {0x1D456, 0x1D49C}, {0x1D49E, 0x1D49F}, + {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, + {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, + {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514}, + {0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, + {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, + {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, + {0x1D6DC, 0x1D6FA}, {0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, + {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, + {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, + {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D}, {0x1E14E, 0x1E14E}, + {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, + {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, + {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27}, + {0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, + {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, + {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, + {0x1EE51, 0x1EE52}, {0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, + {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, + {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, + {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72}, {0x1EE74, 0x1EE77}, + {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, + {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, + {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734}, + {0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, + {0x2F800, 0x2FA1D}, {0x30000, 0x3134A}}; + +// Unicode 13 XID_Continue, excluding XID_Start +// The Unicode Property XID_Continue is a super set of XID_Start. +// To save Space, the table below only contains the codepoints +// that are not also in XID_Start. 
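
[Note] The two tables added here (XIDStartRanges above, XIDContinueRanges below) are sorted, inclusive codepoint ranges for Unicode 13; XIDContinueRanges deliberately omits everything already covered by XIDStartRanges, which is why the lexer consults both. They are queried through llvm::sys::UnicodeCharSet, which binary-searches the ranges. A rough sketch of the lookup pattern, mirroring the isAllowedIDChar changes earlier in this patch (the free-standing helper below is hypothetical; the arrays are the ones defined in UnicodeCharSets.h):

  #include "llvm/Support/UnicodeCharRanges.h"
  #include <cstdint>

  // Assumes XIDStartRanges / XIDContinueRanges from UnicodeCharSets.h are visible.
  static bool isXIDContinue(uint32_t C) {
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
    // '_' (U+005F) appears in XIDContinueRanges, so only a *leading* '_' needs
    // the special case that Lexer.cpp adds for isAllowedInitiallyIDChar.
    return XIDStartChars.contains(C) || XIDContinueChars.contains(C);
  }
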
+static const llvm::sys::UnicodeCharRange XIDContinueRanges[] = { + {0x0030, 0x0039}, {0x005F, 0x005F}, {0x00B7, 0x00B7}, + {0x0300, 0x036F}, {0x0387, 0x0387}, {0x0483, 0x0487}, + {0x0591, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2}, + {0x05C4, 0x05C5}, {0x05C7, 0x05C7}, {0x0610, 0x061A}, + {0x064B, 0x0669}, {0x0670, 0x0670}, {0x06D6, 0x06DC}, + {0x06DF, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED}, + {0x06F0, 0x06F9}, {0x0711, 0x0711}, {0x0730, 0x074A}, + {0x07A6, 0x07B0}, {0x07C0, 0x07C9}, {0x07EB, 0x07F3}, + {0x07FD, 0x07FD}, {0x0816, 0x0819}, {0x081B, 0x0823}, + {0x0825, 0x0827}, {0x0829, 0x082D}, {0x0859, 0x085B}, + {0x08D3, 0x08E1}, {0x08E3, 0x0903}, {0x093A, 0x093C}, + {0x093E, 0x094F}, {0x0951, 0x0957}, {0x0962, 0x0963}, + {0x0966, 0x096F}, {0x0981, 0x0983}, {0x09BC, 0x09BC}, + {0x09BE, 0x09C4}, {0x09C7, 0x09C8}, {0x09CB, 0x09CD}, + {0x09D7, 0x09D7}, {0x09E2, 0x09E3}, {0x09E6, 0x09EF}, + {0x09FE, 0x09FE}, {0x0A01, 0x0A03}, {0x0A3C, 0x0A3C}, + {0x0A3E, 0x0A42}, {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, + {0x0A51, 0x0A51}, {0x0A66, 0x0A71}, {0x0A75, 0x0A75}, + {0x0A81, 0x0A83}, {0x0ABC, 0x0ABC}, {0x0ABE, 0x0AC5}, + {0x0AC7, 0x0AC9}, {0x0ACB, 0x0ACD}, {0x0AE2, 0x0AE3}, + {0x0AE6, 0x0AEF}, {0x0AFA, 0x0AFF}, {0x0B01, 0x0B03}, + {0x0B3C, 0x0B3C}, {0x0B3E, 0x0B44}, {0x0B47, 0x0B48}, + {0x0B4B, 0x0B4D}, {0x0B55, 0x0B57}, {0x0B62, 0x0B63}, + {0x0B66, 0x0B6F}, {0x0B82, 0x0B82}, {0x0BBE, 0x0BC2}, + {0x0BC6, 0x0BC8}, {0x0BCA, 0x0BCD}, {0x0BD7, 0x0BD7}, + {0x0BE6, 0x0BEF}, {0x0C00, 0x0C04}, {0x0C3E, 0x0C44}, + {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56}, + {0x0C62, 0x0C63}, {0x0C66, 0x0C6F}, {0x0C81, 0x0C83}, + {0x0CBC, 0x0CBC}, {0x0CBE, 0x0CC4}, {0x0CC6, 0x0CC8}, + {0x0CCA, 0x0CCD}, {0x0CD5, 0x0CD6}, {0x0CE2, 0x0CE3}, + {0x0CE6, 0x0CEF}, {0x0D00, 0x0D03}, {0x0D3B, 0x0D3C}, + {0x0D3E, 0x0D44}, {0x0D46, 0x0D48}, {0x0D4A, 0x0D4D}, + {0x0D57, 0x0D57}, {0x0D62, 0x0D63}, {0x0D66, 0x0D6F}, + {0x0D81, 0x0D83}, {0x0DCA, 0x0DCA}, {0x0DCF, 0x0DD4}, + {0x0DD6, 0x0DD6}, {0x0DD8, 0x0DDF}, {0x0DE6, 0x0DEF}, + {0x0DF2, 0x0DF3}, {0x0E31, 0x0E31}, {0x0E33, 0x0E3A}, + {0x0E47, 0x0E4E}, {0x0E50, 0x0E59}, {0x0EB1, 0x0EB1}, + {0x0EB3, 0x0EBC}, {0x0EC8, 0x0ECD}, {0x0ED0, 0x0ED9}, + {0x0F18, 0x0F19}, {0x0F20, 0x0F29}, {0x0F35, 0x0F35}, + {0x0F37, 0x0F37}, {0x0F39, 0x0F39}, {0x0F3E, 0x0F3F}, + {0x0F71, 0x0F84}, {0x0F86, 0x0F87}, {0x0F8D, 0x0F97}, + {0x0F99, 0x0FBC}, {0x0FC6, 0x0FC6}, {0x102B, 0x103E}, + {0x1040, 0x1049}, {0x1056, 0x1059}, {0x105E, 0x1060}, + {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, + {0x1082, 0x108D}, {0x108F, 0x109D}, {0x135D, 0x135F}, + {0x1369, 0x1371}, {0x1712, 0x1714}, {0x1732, 0x1734}, + {0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, + {0x17DD, 0x17DD}, {0x17E0, 0x17E9}, {0x180B, 0x180D}, + {0x1810, 0x1819}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, + {0x1930, 0x193B}, {0x1946, 0x194F}, {0x19D0, 0x19DA}, + {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E}, {0x1A60, 0x1A7C}, + {0x1A7F, 0x1A89}, {0x1A90, 0x1A99}, {0x1AB0, 0x1ABD}, + {0x1ABF, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, + {0x1B50, 0x1B59}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, + {0x1BA1, 0x1BAD}, {0x1BB0, 0x1BB9}, {0x1BE6, 0x1BF3}, + {0x1C24, 0x1C37}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, + {0x1CD0, 0x1CD2}, {0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, + {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, + {0x1DFB, 0x1DFF}, {0x203F, 0x2040}, {0x2054, 0x2054}, + {0x20D0, 0x20DC}, {0x20E1, 0x20E1}, {0x20E5, 0x20F0}, + {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, + {0x302A, 0x302F}, {0x3099, 0x309A}, {0xA620, 0xA629}, + {0xA66F, 
0xA66F}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, + {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, + {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, + {0xA880, 0xA881}, {0xA8B4, 0xA8C5}, {0xA8D0, 0xA8D9}, + {0xA8E0, 0xA8F1}, {0xA8FF, 0xA909}, {0xA926, 0xA92D}, + {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, + {0xA9D0, 0xA9D9}, {0xA9E5, 0xA9E5}, {0xA9F0, 0xA9F9}, + {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D}, + {0xAA50, 0xAA59}, {0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, + {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, + {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, + {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xABF0, 0xABF9}, + {0xFB1E, 0xFB1E}, {0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, + {0xFE33, 0xFE34}, {0xFE4D, 0xFE4F}, {0xFF10, 0xFF19}, + {0xFF3F, 0xFF3F}, {0xFF9E, 0xFF9F}, {0x101FD, 0x101FD}, + {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x104A0, 0x104A9}, + {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, + {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F}, {0x10AE5, 0x10AE6}, + {0x10D24, 0x10D27}, {0x10D30, 0x10D39}, {0x10EAB, 0x10EAC}, + {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, + {0x11066, 0x1106F}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, + {0x110F0, 0x110F9}, {0x11100, 0x11102}, {0x11127, 0x11134}, + {0x11136, 0x1113F}, {0x11145, 0x11146}, {0x11173, 0x11173}, + {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, + {0x111CE, 0x111D9}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, + {0x112DF, 0x112EA}, {0x112F0, 0x112F9}, {0x11300, 0x11303}, + {0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, + {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, + {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, + {0x11450, 0x11459}, {0x1145E, 0x1145E}, {0x114B0, 0x114C3}, + {0x114D0, 0x114D9}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, + {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x11650, 0x11659}, + {0x116AB, 0x116B7}, {0x116C0, 0x116C9}, {0x1171D, 0x1172B}, + {0x11730, 0x11739}, {0x1182C, 0x1183A}, {0x118E0, 0x118E9}, + {0x11930, 0x11935}, {0x11937, 0x11938}, {0x1193B, 0x1193E}, + {0x11940, 0x11940}, {0x11942, 0x11943}, {0x11950, 0x11959}, + {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, + {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, + {0x11A47, 0x11A47}, {0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, + {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C50, 0x11C59}, + {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, + {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45}, + {0x11D47, 0x11D47}, {0x11D50, 0x11D59}, {0x11D8A, 0x11D8E}, + {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11DA0, 0x11DA9}, + {0x11EF3, 0x11EF6}, {0x16A60, 0x16A69}, {0x16AF0, 0x16AF4}, + {0x16B30, 0x16B36}, {0x16B50, 0x16B59}, {0x16F4F, 0x16F4F}, + {0x16F51, 0x16F87}, {0x16F8F, 0x16F92}, {0x16FE4, 0x16FE4}, + {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, + {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, + {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1D7CE, 0x1D7FF}, + {0x1DA00, 0x1DA36}, {0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, + {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, + {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, + {0x1E023, 0x1E024}, {0x1E026, 0x1E02A}, {0x1E130, 0x1E136}, + {0x1E140, 0x1E149}, {0x1E2EC, 0x1E2F9}, {0x1E8D0, 0x1E8D6}, + {0x1E944, 0x1E94A}, {0x1E950, 0x1E959}, {0x1FBF0, 0x1FBF9}, + {0xE0100, 0xE01EF}}; + // C11 D.1, C++11 [charname.allowed] static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[] = { // 1 @@ -40,127 +378,6 @@ static const 
llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[] = { { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD } }; -// C++03 [extendid] -// Note that this is not the same as C++98, but we don't distinguish C++98 -// and C++03 in Clang. -static const llvm::sys::UnicodeCharRange CXX03AllowedIDCharRanges[] = { - // Latin - { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 }, { 0x00F8, 0x01F5 }, - { 0x01FA, 0x0217 }, { 0x0250, 0x02A8 }, - - // Greek - { 0x0384, 0x0384 }, { 0x0388, 0x038A }, { 0x038C, 0x038C }, - { 0x038E, 0x03A1 }, { 0x03A3, 0x03CE }, { 0x03D0, 0x03D6 }, - { 0x03DA, 0x03DA }, { 0x03DC, 0x03DC }, { 0x03DE, 0x03DE }, - { 0x03E0, 0x03E0 }, { 0x03E2, 0x03F3 }, - - // Cyrillic - { 0x0401, 0x040D }, { 0x040F, 0x044F }, { 0x0451, 0x045C }, - { 0x045E, 0x0481 }, { 0x0490, 0x04C4 }, { 0x04C7, 0x04C8 }, - { 0x04CB, 0x04CC }, { 0x04D0, 0x04EB }, { 0x04EE, 0x04F5 }, - { 0x04F8, 0x04F9 }, - - // Armenian - { 0x0531, 0x0556 }, { 0x0561, 0x0587 }, - - // Hebrew - { 0x05D0, 0x05EA }, { 0x05F0, 0x05F4 }, - - // Arabic - { 0x0621, 0x063A }, { 0x0640, 0x0652 }, { 0x0670, 0x06B7 }, - { 0x06BA, 0x06BE }, { 0x06C0, 0x06CE }, { 0x06E5, 0x06E7 }, - - // Devanagari - { 0x0905, 0x0939 }, { 0x0958, 0x0962 }, - - // Bengali - { 0x0985, 0x098C }, { 0x098F, 0x0990 }, { 0x0993, 0x09A8 }, - { 0x09AA, 0x09B0 }, { 0x09B2, 0x09B2 }, { 0x09B6, 0x09B9 }, - { 0x09DC, 0x09DD }, { 0x09DF, 0x09E1 }, { 0x09F0, 0x09F1 }, - - // Gurmukhi - { 0x0A05, 0x0A0A }, { 0x0A0F, 0x0A10 }, { 0x0A13, 0x0A28 }, - { 0x0A2A, 0x0A30 }, { 0x0A32, 0x0A33 }, { 0x0A35, 0x0A36 }, - { 0x0A38, 0x0A39 }, { 0x0A59, 0x0A5C }, { 0x0A5E, 0x0A5E }, - - // Gujarti - { 0x0A85, 0x0A8B }, { 0x0A8D, 0x0A8D }, { 0x0A8F, 0x0A91 }, - { 0x0A93, 0x0AA8 }, { 0x0AAA, 0x0AB0 }, { 0x0AB2, 0x0AB3 }, - { 0x0AB5, 0x0AB9 }, { 0x0AE0, 0x0AE0 }, - - // Oriya - { 0x0B05, 0x0B0C }, { 0x0B0F, 0x0B10 }, { 0x0B13, 0x0B28 }, - { 0x0B2A, 0x0B30 }, { 0x0B32, 0x0B33 }, { 0x0B36, 0x0B39 }, - { 0x0B5C, 0x0B5D }, { 0x0B5F, 0x0B61 }, - - // Tamil - { 0x0B85, 0x0B8A }, { 0x0B8E, 0x0B90 }, { 0x0B92, 0x0B95 }, - { 0x0B99, 0x0B9A }, { 0x0B9C, 0x0B9C }, { 0x0B9E, 0x0B9F }, - { 0x0BA3, 0x0BA4 }, { 0x0BA8, 0x0BAA }, { 0x0BAE, 0x0BB5 }, - { 0x0BB7, 0x0BB9 }, - - // Telugu - { 0x0C05, 0x0C0C }, { 0x0C0E, 0x0C10 }, { 0x0C12, 0x0C28 }, - { 0x0C2A, 0x0C33 }, { 0x0C35, 0x0C39 }, { 0x0C60, 0x0C61 }, - - // Kannada - { 0x0C85, 0x0C8C }, { 0x0C8E, 0x0C90 }, { 0x0C92, 0x0CA8 }, - { 0x0CAA, 0x0CB3 }, { 0x0CB5, 0x0CB9 }, { 0x0CE0, 0x0CE1 }, - - // Malayam - { 0x0D05, 0x0D0C }, { 0x0D0E, 0x0D10 }, { 0x0D12, 0x0D28 }, - { 0x0D2A, 0x0D39 }, { 0x0D60, 0x0D61 }, - - // Thai - { 0x0E01, 0x0E30 }, { 0x0E32, 0x0E33 }, { 0x0E40, 0x0E46 }, - { 0x0E4F, 0x0E5B }, - - // Lao - { 0x0E81, 0x0E82 }, { 0x0E84, 0x0E84 }, { 0x0E87, 0x0E87 }, - { 0x0E88, 0x0E88 }, { 0x0E8A, 0x0E8A }, { 0x0E8D, 0x0E8D }, - { 0x0E94, 0x0E97 }, { 0x0E99, 0x0E9F }, { 0x0EA1, 0x0EA3 }, - { 0x0EA5, 0x0EA5 }, { 0x0EA7, 0x0EA7 }, { 0x0EAA, 0x0EAA }, - { 0x0EAB, 0x0EAB }, { 0x0EAD, 0x0EB0 }, { 0x0EB2, 0x0EB2 }, - { 0x0EB3, 0x0EB3 }, { 0x0EBD, 0x0EBD }, { 0x0EC0, 0x0EC4 }, - { 0x0EC6, 0x0EC6 }, - - // Georgian - { 0x10A0, 0x10C5 }, { 0x10D0, 0x10F6 }, - - // Hangul - { 0x1100, 0x1159 }, { 0x1161, 0x11A2 }, { 0x11A8, 0x11F9 }, - - // Latin (2) - { 0x1E00, 0x1E9A }, { 0x1EA0, 0x1EF9 }, - - // Greek (2) - { 0x1F00, 0x1F15 }, { 0x1F18, 0x1F1D }, { 0x1F20, 0x1F45 }, - { 0x1F48, 0x1F4D }, { 0x1F50, 0x1F57 }, { 0x1F59, 0x1F59 }, - { 0x1F5B, 0x1F5B }, { 0x1F5D, 0x1F5D }, { 0x1F5F, 0x1F7D }, - { 0x1F80, 0x1FB4 }, { 0x1FB6, 0x1FBC }, { 0x1FC2, 0x1FC4 }, - { 0x1FC6, 0x1FCC }, { 
0x1FD0, 0x1FD3 }, { 0x1FD6, 0x1FDB }, - { 0x1FE0, 0x1FEC }, { 0x1FF2, 0x1FF4 }, { 0x1FF6, 0x1FFC }, - - // Hiragana - { 0x3041, 0x3094 }, { 0x309B, 0x309E }, - - // Katakana - { 0x30A1, 0x30FE }, - - // Bopmofo [sic] - { 0x3105, 0x312C }, - - // CJK Unified Ideographs - { 0x4E00, 0x9FA5 }, { 0xF900, 0xFA2D }, { 0xFB1F, 0xFB36 }, - { 0xFB38, 0xFB3C }, { 0xFB3E, 0xFB3E }, { 0xFB40, 0xFB41 }, - { 0xFB42, 0xFB44 }, { 0xFB46, 0xFBB1 }, { 0xFBD3, 0xFD3F }, - { 0xFD50, 0xFD8F }, { 0xFD92, 0xFDC7 }, { 0xFDF0, 0xFDFB }, - { 0xFE70, 0xFE72 }, { 0xFE74, 0xFE74 }, { 0xFE76, 0xFEFC }, - { 0xFF21, 0xFF3A }, { 0xFF41, 0xFF5A }, { 0xFF66, 0xFFBE }, - { 0xFFC2, 0xFFC7 }, { 0xFFCA, 0xFFCF }, { 0xFFD2, 0xFFD7 }, - { 0xFFDA, 0xFFDC } -}; - // C99 Annex D static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[] = { // Latin (1) diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 4896640a4eabe..7870b3d30d70e 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -2401,7 +2401,7 @@ Parser::ParseExprAfterUnaryExprOrTypeTrait(const Token &OpTok, /// a parameter. ExprResult Parser::ParseSYCLUniqueStableNameExpression() { assert(Tok.is(tok::kw___builtin_sycl_unique_stable_name) && - "Not __bulitin_sycl_unique_stable_name"); + "Not __builtin_sycl_unique_stable_name"); SourceLocation OpLoc = ConsumeToken(); BalancedDelimiterTracker T(*this, tok::l_paren); diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index aa2602c8d9256..99ce143d3559d 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -1125,7 +1125,7 @@ namespace { // unreachable in all instantiations of the template. if (!IsTemplateInstantiation) S.Diag(AS->getBeginLoc(), - diag::warn_fallthrough_attr_unreachable); + diag::warn_unreachable_fallthrough_attr); markFallthroughVisited(AS); ++AnnotatedCnt; break; diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp index 72d9ea6dd3bf3..7fee9545f8b28 100644 --- a/clang/lib/Sema/DeclSpec.cpp +++ b/clang/lib/Sema/DeclSpec.cpp @@ -1300,8 +1300,8 @@ void DeclSpec::Finish(Sema &S, const PrintingPolicy &Policy) { if (!S.getLangOpts().CPlusPlus) S.Diag(TSTLoc, diag::ext_integer_complex); } else if (TypeSpecType != TST_float && TypeSpecType != TST_double && - TypeSpecType != TST_float128) { - // FIXME: _Float16, __fp16? + TypeSpecType != TST_float128 && TypeSpecType != TST_float16) { + // FIXME: __fp16? 
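A minimal sketch of what the relaxed condition above permits (hypothetical snippet, not part of the patch; assumes a target where _Float16 is a supported type):

  _Complex float    fc;   // accepted before and after this change
  _Complex _Float16 hc;   // no longer rejected here with err_invalid_complex_spec
  // _Complex __fp16 remains an open question, per the FIXME above.
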
S.Diag(TSCLoc, diag::err_invalid_complex_spec) << getSpecifierName((TST)TypeSpecType, Policy); TypeSpecComplex = TSC_unspecified; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index ecf94a1d78070..7332ca06d5d9c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3903,14 +3903,22 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_vcvttss2si64: case X86::BI__builtin_ia32_vcvttss2usi32: case X86::BI__builtin_ia32_vcvttss2usi64: + case X86::BI__builtin_ia32_vcvttsh2si32: + case X86::BI__builtin_ia32_vcvttsh2si64: + case X86::BI__builtin_ia32_vcvttsh2usi32: + case X86::BI__builtin_ia32_vcvttsh2usi64: ArgNum = 1; break; case X86::BI__builtin_ia32_maxpd512: case X86::BI__builtin_ia32_maxps512: case X86::BI__builtin_ia32_minpd512: case X86::BI__builtin_ia32_minps512: + case X86::BI__builtin_ia32_maxph512: + case X86::BI__builtin_ia32_minph512: ArgNum = 2; break; + case X86::BI__builtin_ia32_vcvtph2pd512_mask: + case X86::BI__builtin_ia32_vcvtph2psx512_mask: case X86::BI__builtin_ia32_cvtps2pd512_mask: case X86::BI__builtin_ia32_cvttpd2dq512_mask: case X86::BI__builtin_ia32_cvttpd2qq512_mask: @@ -3920,16 +3928,24 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_cvttps2qq512_mask: case X86::BI__builtin_ia32_cvttps2udq512_mask: case X86::BI__builtin_ia32_cvttps2uqq512_mask: + case X86::BI__builtin_ia32_vcvttph2w512_mask: + case X86::BI__builtin_ia32_vcvttph2uw512_mask: + case X86::BI__builtin_ia32_vcvttph2dq512_mask: + case X86::BI__builtin_ia32_vcvttph2udq512_mask: + case X86::BI__builtin_ia32_vcvttph2qq512_mask: + case X86::BI__builtin_ia32_vcvttph2uqq512_mask: case X86::BI__builtin_ia32_exp2pd_mask: case X86::BI__builtin_ia32_exp2ps_mask: case X86::BI__builtin_ia32_getexppd512_mask: case X86::BI__builtin_ia32_getexpps512_mask: + case X86::BI__builtin_ia32_getexpph512_mask: case X86::BI__builtin_ia32_rcp28pd_mask: case X86::BI__builtin_ia32_rcp28ps_mask: case X86::BI__builtin_ia32_rsqrt28pd_mask: case X86::BI__builtin_ia32_rsqrt28ps_mask: case X86::BI__builtin_ia32_vcomisd: case X86::BI__builtin_ia32_vcomiss: + case X86::BI__builtin_ia32_vcomish: case X86::BI__builtin_ia32_vcvtph2ps512_mask: ArgNum = 3; break; @@ -3937,21 +3953,30 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_cmpps512_mask: case X86::BI__builtin_ia32_cmpsd_mask: case X86::BI__builtin_ia32_cmpss_mask: + case X86::BI__builtin_ia32_cmpsh_mask: + case X86::BI__builtin_ia32_vcvtsh2sd_round_mask: + case X86::BI__builtin_ia32_vcvtsh2ss_round_mask: case X86::BI__builtin_ia32_cvtss2sd_round_mask: case X86::BI__builtin_ia32_getexpsd128_round_mask: case X86::BI__builtin_ia32_getexpss128_round_mask: + case X86::BI__builtin_ia32_getexpsh128_round_mask: case X86::BI__builtin_ia32_getmantpd512_mask: case X86::BI__builtin_ia32_getmantps512_mask: + case X86::BI__builtin_ia32_getmantph512_mask: case X86::BI__builtin_ia32_maxsd_round_mask: case X86::BI__builtin_ia32_maxss_round_mask: + case X86::BI__builtin_ia32_maxsh_round_mask: case X86::BI__builtin_ia32_minsd_round_mask: case X86::BI__builtin_ia32_minss_round_mask: + case X86::BI__builtin_ia32_minsh_round_mask: case X86::BI__builtin_ia32_rcp28sd_round_mask: case X86::BI__builtin_ia32_rcp28ss_round_mask: case X86::BI__builtin_ia32_reducepd512_mask: case X86::BI__builtin_ia32_reduceps512_mask: + case X86::BI__builtin_ia32_reduceph512_mask: case 
X86::BI__builtin_ia32_rndscalepd_mask: case X86::BI__builtin_ia32_rndscaleps_mask: + case X86::BI__builtin_ia32_rndscaleph_mask: case X86::BI__builtin_ia32_rsqrt28sd_round_mask: case X86::BI__builtin_ia32_rsqrt28ss_round_mask: ArgNum = 4; @@ -3966,14 +3991,17 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_fixupimmss_maskz: case X86::BI__builtin_ia32_getmantsd_round_mask: case X86::BI__builtin_ia32_getmantss_round_mask: + case X86::BI__builtin_ia32_getmantsh_round_mask: case X86::BI__builtin_ia32_rangepd512_mask: case X86::BI__builtin_ia32_rangeps512_mask: case X86::BI__builtin_ia32_rangesd128_round_mask: case X86::BI__builtin_ia32_rangess128_round_mask: case X86::BI__builtin_ia32_reducesd_mask: case X86::BI__builtin_ia32_reducess_mask: + case X86::BI__builtin_ia32_reducesh_mask: case X86::BI__builtin_ia32_rndscalesd_round_mask: case X86::BI__builtin_ia32_rndscaless_round_mask: + case X86::BI__builtin_ia32_rndscalesh_round_mask: ArgNum = 5; break; case X86::BI__builtin_ia32_vcvtsd2si64: @@ -3984,11 +4012,20 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_vcvtss2si64: case X86::BI__builtin_ia32_vcvtss2usi32: case X86::BI__builtin_ia32_vcvtss2usi64: + case X86::BI__builtin_ia32_vcvtsh2si32: + case X86::BI__builtin_ia32_vcvtsh2si64: + case X86::BI__builtin_ia32_vcvtsh2usi32: + case X86::BI__builtin_ia32_vcvtsh2usi64: case X86::BI__builtin_ia32_sqrtpd512: case X86::BI__builtin_ia32_sqrtps512: + case X86::BI__builtin_ia32_sqrtph512: ArgNum = 1; HasRC = true; break; + case X86::BI__builtin_ia32_addph512: + case X86::BI__builtin_ia32_divph512: + case X86::BI__builtin_ia32_mulph512: + case X86::BI__builtin_ia32_subph512: case X86::BI__builtin_ia32_addpd512: case X86::BI__builtin_ia32_addps512: case X86::BI__builtin_ia32_divpd512: @@ -4003,11 +4040,17 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_cvtusi2sd64: case X86::BI__builtin_ia32_cvtusi2ss32: case X86::BI__builtin_ia32_cvtusi2ss64: + case X86::BI__builtin_ia32_vcvtusi2sh: + case X86::BI__builtin_ia32_vcvtusi642sh: + case X86::BI__builtin_ia32_vcvtsi2sh: + case X86::BI__builtin_ia32_vcvtsi642sh: ArgNum = 2; HasRC = true; break; case X86::BI__builtin_ia32_cvtdq2ps512_mask: case X86::BI__builtin_ia32_cvtudq2ps512_mask: + case X86::BI__builtin_ia32_vcvtpd2ph512_mask: + case X86::BI__builtin_ia32_vcvtps2phx512_mask: case X86::BI__builtin_ia32_cvtpd2ps512_mask: case X86::BI__builtin_ia32_cvtpd2dq512_mask: case X86::BI__builtin_ia32_cvtpd2qq512_mask: @@ -4021,24 +4064,45 @@ bool Sema::CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_cvtqq2ps512_mask: case X86::BI__builtin_ia32_cvtuqq2pd512_mask: case X86::BI__builtin_ia32_cvtuqq2ps512_mask: + case X86::BI__builtin_ia32_vcvtdq2ph512_mask: + case X86::BI__builtin_ia32_vcvtudq2ph512_mask: + case X86::BI__builtin_ia32_vcvtw2ph512_mask: + case X86::BI__builtin_ia32_vcvtuw2ph512_mask: + case X86::BI__builtin_ia32_vcvtph2w512_mask: + case X86::BI__builtin_ia32_vcvtph2uw512_mask: + case X86::BI__builtin_ia32_vcvtph2dq512_mask: + case X86::BI__builtin_ia32_vcvtph2udq512_mask: + case X86::BI__builtin_ia32_vcvtph2qq512_mask: + case X86::BI__builtin_ia32_vcvtph2uqq512_mask: + case X86::BI__builtin_ia32_vcvtqq2ph512_mask: + case X86::BI__builtin_ia32_vcvtuqq2ph512_mask: ArgNum = 3; HasRC = true; break; + case X86::BI__builtin_ia32_addsh_round_mask: case 
X86::BI__builtin_ia32_addss_round_mask: case X86::BI__builtin_ia32_addsd_round_mask: + case X86::BI__builtin_ia32_divsh_round_mask: case X86::BI__builtin_ia32_divss_round_mask: case X86::BI__builtin_ia32_divsd_round_mask: + case X86::BI__builtin_ia32_mulsh_round_mask: case X86::BI__builtin_ia32_mulss_round_mask: case X86::BI__builtin_ia32_mulsd_round_mask: + case X86::BI__builtin_ia32_subsh_round_mask: case X86::BI__builtin_ia32_subss_round_mask: case X86::BI__builtin_ia32_subsd_round_mask: + case X86::BI__builtin_ia32_scalefph512_mask: case X86::BI__builtin_ia32_scalefpd512_mask: case X86::BI__builtin_ia32_scalefps512_mask: case X86::BI__builtin_ia32_scalefsd_round_mask: case X86::BI__builtin_ia32_scalefss_round_mask: + case X86::BI__builtin_ia32_scalefsh_round_mask: case X86::BI__builtin_ia32_cvtsd2ss_round_mask: + case X86::BI__builtin_ia32_vcvtss2sh_round_mask: + case X86::BI__builtin_ia32_vcvtsd2sh_round_mask: case X86::BI__builtin_ia32_sqrtsd_round_mask: case X86::BI__builtin_ia32_sqrtss_round_mask: + case X86::BI__builtin_ia32_sqrtsh_round_mask: case X86::BI__builtin_ia32_vfmaddsd3_mask: case X86::BI__builtin_ia32_vfmaddsd3_maskz: case X86::BI__builtin_ia32_vfmaddsd3_mask3: @@ -4412,6 +4476,9 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_getmantps256_mask: case X86::BI__builtin_ia32_getmantpd512_mask: case X86::BI__builtin_ia32_getmantps512_mask: + case X86::BI__builtin_ia32_getmantph128_mask: + case X86::BI__builtin_ia32_getmantph256_mask: + case X86::BI__builtin_ia32_getmantph512_mask: case X86::BI__builtin_ia32_vec_ext_v16qi: case X86::BI__builtin_ia32_vec_ext_v16hi: i = 1; l = 0; u = 15; @@ -4430,6 +4497,7 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_rangeps512_mask: case X86::BI__builtin_ia32_getmantsd_round_mask: case X86::BI__builtin_ia32_getmantss_round_mask: + case X86::BI__builtin_ia32_getmantsh_round_mask: case X86::BI__builtin_ia32_vec_set_v16qi: case X86::BI__builtin_ia32_vec_set_v16hi: i = 2; l = 0; u = 15; @@ -4482,12 +4550,16 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_rndscalepd_256_mask: case X86::BI__builtin_ia32_rndscaleps_mask: case X86::BI__builtin_ia32_rndscalepd_mask: + case X86::BI__builtin_ia32_rndscaleph_mask: case X86::BI__builtin_ia32_reducepd128_mask: case X86::BI__builtin_ia32_reducepd256_mask: case X86::BI__builtin_ia32_reducepd512_mask: case X86::BI__builtin_ia32_reduceps128_mask: case X86::BI__builtin_ia32_reduceps256_mask: case X86::BI__builtin_ia32_reduceps512_mask: + case X86::BI__builtin_ia32_reduceph128_mask: + case X86::BI__builtin_ia32_reduceph256_mask: + case X86::BI__builtin_ia32_reduceph512_mask: case X86::BI__builtin_ia32_prold512: case X86::BI__builtin_ia32_prolq512: case X86::BI__builtin_ia32_prold128: @@ -4506,8 +4578,12 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_fpclassps256_mask: case X86::BI__builtin_ia32_fpclassps512_mask: case X86::BI__builtin_ia32_fpclasspd512_mask: + case X86::BI__builtin_ia32_fpclassph128_mask: + case X86::BI__builtin_ia32_fpclassph256_mask: + case X86::BI__builtin_ia32_fpclassph512_mask: case X86::BI__builtin_ia32_fpclasssd_mask: case X86::BI__builtin_ia32_fpclassss_mask: + case X86::BI__builtin_ia32_fpclasssh_mask: case X86::BI__builtin_ia32_pslldqi128_byteshift: case X86::BI__builtin_ia32_pslldqi256_byteshift: case 
X86::BI__builtin_ia32_pslldqi512_byteshift: @@ -4618,6 +4694,8 @@ bool Sema::CheckX86BuiltinFunctionCall(const TargetInfo &TI, unsigned BuiltinID, case X86::BI__builtin_ia32_reducess_mask: case X86::BI__builtin_ia32_rndscalesd_round_mask: case X86::BI__builtin_ia32_rndscaless_round_mask: + case X86::BI__builtin_ia32_rndscalesh_round_mask: + case X86::BI__builtin_ia32_reducesh_mask: i = 4; l = 0; u = 255; break; } diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index e8e8fb209cc9a..8c551a798b32b 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -4389,7 +4389,8 @@ void Sema::CodeCompleteAttribute(AttributeCommonInfo::Syntax Syntax, auto AddCompletions = [&](const ParsedAttrInfo &A) { if (A.IsTargetSpecific && !A.existsInTarget(Context.getTargetInfo())) return; - // FIXME: filter by langopts (diagLangOpts method requires a ParsedAttr) + if (!A.acceptsLangOpts(getLangOpts())) + return; for (const auto &S : A.Spellings) { if (S.Syntax != Syntax) continue; @@ -4422,33 +4423,59 @@ void Sema::CodeCompleteAttribute(AttributeCommonInfo::Syntax Syntax, Scope = ""; } + auto Add = [&](llvm::StringRef Scope, llvm::StringRef Name, + bool Underscores) { + CodeCompletionBuilder Builder(Results.getAllocator(), + Results.getCodeCompletionTUInfo()); + llvm::SmallString<32> Text; + if (!Scope.empty()) { + Text.append(Scope); + Text.append("::"); + } + if (Underscores) + Text.append("__"); + Text.append(Name); + if (Underscores) + Text.append("__"); + Builder.AddTypedTextChunk(Results.getAllocator().CopyString(Text)); + + if (!A.ArgNames.empty()) { + Builder.AddChunk(CodeCompletionString::CK_LeftParen, "("); + bool First = true; + for (const char *Arg : A.ArgNames) { + if (!First) + Builder.AddChunk(CodeCompletionString::CK_Comma, ", "); + First = false; + Builder.AddPlaceholderChunk(Arg); + } + Builder.AddChunk(CodeCompletionString::CK_RightParen, ")"); + } + + Results.AddResult(Builder.TakeString()); + }; + // Generate the non-underscore-guarded result. // Note this is (a suffix of) the NormalizedFullName, no need to copy. // If an underscore-guarded scope was specified, only the // underscore-guarded attribute name is relevant. if (!InScopeUnderscore) - Results.AddResult(Scope.empty() ? Name.data() : S.NormalizedFullName); + Add(Scope, Name, /*Underscores=*/false); // Generate the underscore-guarded version, for syntaxes that support it. // We skip this if the scope was already spelled and not guarded, or // we must spell it and can't guard it. if (!(InScope && !InScopeUnderscore) && SyntaxSupportsGuards) { llvm::SmallString<32> Guarded; - if (!Scope.empty()) { + if (Scope.empty()) { + Add(Scope, Name, /*Underscores=*/true); + } else { const char *GuardedScope = underscoreAttrScope(Scope); if (!GuardedScope) continue; - Guarded.append(GuardedScope); - Guarded.append("::"); + Add(GuardedScope, Name, /*Underscores=*/true); } - Guarded.append("__"); - Guarded.append(Name); - Guarded.append("__"); - Results.AddResult( - CodeCompletionResult(Results.getAllocator().CopyString(Guarded))); } - // FIXME: include the list of arg names (not currently exposed). // It may be nice to include the Kind so we can look up the docs later. 
} }; diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index a00a72982bac8..4827f6b3bb345 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -16836,10 +16836,7 @@ NamedDecl *Sema::ActOnFriendFunctionDecl(Scope *S, Declarator &D, while (DC->isRecord()) DC = DC->getParent(); - DeclContext *LookupDC = DC; - while (LookupDC->isTransparentContext()) - LookupDC = LookupDC->getParent(); - + DeclContext *LookupDC = DC->getNonTransparentContext(); while (true) { LookupQualifiedName(Previous, LookupDC); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 68b6c5e384599..9d9b39b3ab996 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -18442,7 +18442,6 @@ static ExprResult rebuildPotentialResultsAsNonOdrUsed(Sema &S, Expr *E, ME->getQualifierLoc(), ME->getTemplateKeywordLoc(), ME->getMemberDecl(), ME->getFoundDecl(), ME->getMemberNameInfo(), CopiedTemplateArgs(ME), ME->getType(), ME->getValueKind(), ME->getObjectKind(), NOUR); - return ExprEmpty(); } case Expr::BinaryOperatorClass: { diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index af2aa49c01039..92b7464cd0bbf 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -564,10 +564,7 @@ bool Sema::CheckQualifiedMemberReference(Expr *BaseExpr, return false; // Note that we use the DC of the decl, not the underlying decl. - DeclContext *DC = (*I)->getDeclContext(); - while (DC->isTransparentContext()) - DC = DC->getParent(); - + DeclContext *DC = (*I)->getDeclContext()->getNonTransparentContext(); if (!DC->isRecord()) continue; diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp index 8a9c933fc93f8..9e46801ea5089 100644 --- a/clang/lib/Sema/SemaExprObjC.cpp +++ b/clang/lib/Sema/SemaExprObjC.cpp @@ -4015,12 +4015,11 @@ static bool CheckObjCBridgeNSCast(Sema &S, QualType castType, Expr *castExpr, if (Parm->isStr("id")) return true; - NamedDecl *Target = nullptr; // Check for an existing type with this name. 
LookupResult R(S, DeclarationName(Parm), SourceLocation(), Sema::LookupOrdinaryName); if (S.LookupName(R, S.TUScope)) { - Target = R.getFoundDecl(); + NamedDecl *Target = R.getFoundDecl(); if (Target && isa(Target)) { ObjCInterfaceDecl *ExprClass = cast(Target); if (const ObjCObjectPointerType *InterfacePointerType = @@ -4056,8 +4055,6 @@ static bool CheckObjCBridgeNSCast(Sema &S, QualType castType, Expr *castExpr, diag::err_objc_cf_bridged_not_interface) << castExpr->getType() << Parm; S.Diag(TDNDecl->getBeginLoc(), diag::note_declared_at); - if (Target) - S.Diag(Target->getBeginLoc(), diag::note_declared_at); } return true; } diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index fe81ddf6bf851..51db4a28ee92c 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -5814,6 +5814,31 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( ErrorFound = true; } } + // OpenMP 5.0 [2.19.7] + // If a list item appears in a reduction, lastprivate or linear + // clause on a combined target construct then it is treated as + // if it also appears in a map clause with a map-type of tofrom + if (getLangOpts().OpenMP >= 50 && Kind != OMPD_target && + isOpenMPTargetExecutionDirective(Kind)) { + SmallVector ImplicitExprs; + for (OMPClause *C : Clauses) { + if (auto *RC = dyn_cast(C)) + for (Expr *E : RC->varlists()) + if (!isa(E->IgnoreParenImpCasts())) + ImplicitExprs.emplace_back(E); + } + if (!ImplicitExprs.empty()) { + ArrayRef Exprs = ImplicitExprs; + CXXScopeSpec MapperIdScopeSpec; + DeclarationNameInfo MapperId; + if (OMPClause *Implicit = ActOnOpenMPMapClause( + OMPC_MAP_MODIFIER_unknown, SourceLocation(), MapperIdScopeSpec, + MapperId, OMPC_MAP_tofrom, + /*IsMapTypeImplicit=*/true, SourceLocation(), SourceLocation(), + Exprs, OMPVarListLocTy(), /*NoDiagnose=*/true)) + ClausesWithImplicit.emplace_back(Implicit); + } + } for (unsigned I = 0, E = DefaultmapKindNum; I < E; ++I) { int ClauseKindCnt = -1; for (ArrayRef ImplicitMap : ImplicitMaps[I]) { @@ -18734,7 +18759,10 @@ class MapBaseChecker final : public StmtVisitor { } bool VisitOMPArraySectionExpr(OMPArraySectionExpr *OASE) { - assert(!NoDiagnose && "Array sections cannot be implicitly mapped."); + // After OMP 5.0 Array section in reduction clause will be implicitly + // mapped + assert(!(SemaRef.getLangOpts().OpenMP < 50 && NoDiagnose) && + "Array sections cannot be implicitly mapped."); Expr *E = OASE->getBase()->IgnoreParenImpCasts(); QualType CurType = OMPArraySectionExpr::getBaseOriginalType(E).getCanonicalType(); @@ -18777,6 +18805,8 @@ class MapBaseChecker final : public StmtVisitor { } else if (AllowUnitySizeArraySection && NotUnity) { // A unity or whole array section is not allowed and that is not // compatible with the properties of the current array section. + if (NoDiagnose) + return false; SemaRef.Diag( ELoc, diag::err_array_section_does_not_specify_contiguous_storage) << OASE->getSourceRange(); @@ -19320,7 +19350,7 @@ static void checkMappableExpressionList( CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo MapperId, ArrayRef UnresolvedMappers, OpenMPMapClauseKind MapType = OMPC_MAP_unknown, - bool IsMapTypeImplicit = false) { + bool IsMapTypeImplicit = false, bool NoDiagnose = false) { // We only expect mappable expressions in 'to', 'from', and 'map' clauses. 
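For orientation, a hedged example of the OpenMP 5.0 rule implemented above (illustrative kernel, not from the patch; assumes -fopenmp-version=50): a list item in a reduction clause on a combined target construct is treated as if it also appeared in a map clause with map-type tofrom, so the reduction target needs no explicit map.

  void vsum(int n, const int *a, int *r) {
    #pragma omp target teams distribute parallel for \
            map(to : a[0:n]) reduction(+ : r[0:1])
    for (int i = 0; i < n; ++i)
      r[0] += a[i];
    // Under the 5.0 rule, r[0:1] is treated as if map(tofrom: r[0:1]) were
    // also present; the relaxed assert above is what lets this array section
    // be implicitly mapped.
  }
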
assert((CKind == OMPC_map || CKind == OMPC_to || CKind == OMPC_from) && "Unexpected clause kind with mappable expressions!"); @@ -19399,9 +19429,9 @@ static void checkMappableExpressionList( // Obtain the array or member expression bases if required. Also, fill the // components array with all the components identified in the process. - const Expr *BE = checkMapClauseExpressionBase( - SemaRef, SimpleExpr, CurComponents, CKind, DSAS->getCurrentDirective(), - /*NoDiagnose=*/false); + const Expr *BE = + checkMapClauseExpressionBase(SemaRef, SimpleExpr, CurComponents, CKind, + DSAS->getCurrentDirective(), NoDiagnose); if (!BE) continue; @@ -19447,6 +19477,8 @@ static void checkMappableExpressionList( // OpenMP 4.5 [2.10.5, target update Construct] // threadprivate variables cannot appear in a from clause. if (VD && DSAS->isThreadPrivate(VD)) { + if (NoDiagnose) + continue; DSAStackTy::DSAVarData DVar = DSAS->getTopDSA(VD, /*FromParent=*/false); SemaRef.Diag(ELoc, diag::err_omp_threadprivate_in_clause) << getOpenMPClauseName(CKind); @@ -19507,7 +19539,7 @@ static void checkMappableExpressionList( // OpenMP 4.5 [2.15.5.1, map Clause, Restrictions, p.9] // A list item must have a mappable type. if (!checkTypeMappable(VE->getExprLoc(), VE->getSourceRange(), SemaRef, - DSAS, Type)) + DSAS, Type, /*FullCheck=*/true)) continue; if (CKind == OMPC_map) { @@ -19610,7 +19642,8 @@ OMPClause *Sema::ActOnOpenMPMapClause( CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef VarList, - const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { + const OMPVarListLocTy &Locs, bool NoDiagnose, + ArrayRef UnresolvedMappers) { OpenMPMapModifierKind Modifiers[] = { OMPC_MAP_MODIFIER_unknown, OMPC_MAP_MODIFIER_unknown, OMPC_MAP_MODIFIER_unknown, OMPC_MAP_MODIFIER_unknown}; @@ -19634,7 +19667,7 @@ OMPClause *Sema::ActOnOpenMPMapClause( MappableVarListInfo MVLI(VarList); checkMappableExpressionList(*this, DSAStack, OMPC_map, MVLI, Locs.StartLoc, MapperIdScopeSpec, MapperId, UnresolvedMappers, - MapType, IsMapTypeImplicit); + MapType, IsMapTypeImplicit, NoDiagnose); // We need to produce a map clause even if we don't have variables so that // other diagnostics related with non-existing map clauses are accurate. diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index ac0de21da0d38..acf258a6e357b 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3316,7 +3316,7 @@ Sema::ActOnBreakStmt(SourceLocation BreakLoc, Scope *CurScope) { /// being thrown, or being co_returned from a coroutine. This expression /// might be modified by the implementation. /// -/// \param ForceCXX2b Overrides detection of current language mode +/// \param Mode Overrides detection of current language mode /// and uses the rules for C++2b. 
/// /// \returns An aggregate which contains the Candidate and isMoveEligible diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 4fa98877a2657..f9f6a134262d3 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -1933,10 +1933,10 @@ class TreeTransform { OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { - return getSema().ActOnOpenMPMapClause(MapTypeModifiers, MapTypeModifiersLoc, - MapperIdScopeSpec, MapperId, MapType, - IsMapTypeImplicit, MapLoc, ColonLoc, - VarList, Locs, UnresolvedMappers); + return getSema().ActOnOpenMPMapClause( + MapTypeModifiers, MapTypeModifiersLoc, MapperIdScopeSpec, MapperId, + MapType, IsMapTypeImplicit, MapLoc, ColonLoc, VarList, Locs, + /*NoDiagnose=*/false, UnresolvedMappers); } /// Build a new OpenMP 'allocate' clause. diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 0fafa0eb6551f..d6f9710ad803d 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -4240,8 +4240,11 @@ ASTReader::ASTReadResult ASTReader::ReadAST(StringRef FileName, PreviousGeneration = incrementGeneration(*ContextObj); unsigned NumModules = ModuleMgr.size(); - auto removeModulesAndReturn = [&](ASTReadResult ReadResult) { - assert(ReadResult && "expected to return error"); + SmallVector Loaded; + if (ASTReadResult ReadResult = + ReadASTCore(FileName, Type, ImportLoc, + /*ImportedBy=*/nullptr, Loaded, 0, 0, ASTFileSignature(), + ClientLoadCapabilities)) { ModuleMgr.removeModules(ModuleMgr.begin() + NumModules, PP.getLangOpts().Modules ? &PP.getHeaderSearchInfo().getModuleMap() @@ -4252,22 +4255,6 @@ ASTReader::ASTReadResult ASTReader::ReadAST(StringRef FileName, GlobalIndex.reset(); ModuleMgr.setGlobalIndex(nullptr); return ReadResult; - }; - - SmallVector Loaded; - switch (ASTReadResult ReadResult = - ReadASTCore(FileName, Type, ImportLoc, - /*ImportedBy=*/nullptr, Loaded, 0, 0, - ASTFileSignature(), ClientLoadCapabilities)) { - case Failure: - case Missing: - case OutOfDate: - case VersionMismatch: - case ConfigurationMismatch: - case HadErrors: - return removeModulesAndReturn(ReadResult); - case Success: - break; } // Here comes stuff that we only do once the entire chain is loaded. @@ -4279,18 +4266,18 @@ ASTReader::ASTReadResult ASTReader::ReadAST(StringRef FileName, // Read the AST block. if (ASTReadResult Result = ReadASTBlock(F, ClientLoadCapabilities)) - return removeModulesAndReturn(Result); + return Failure; // The AST block should always have a definition for the main module. if (F.isModule() && !F.DidReadTopLevelSubmodule) { Error(diag::err_module_file_missing_top_level_submodule, F.FileName); - return removeModulesAndReturn(Failure); + return Failure; } // Read the extension blocks. while (!SkipCursorToBlock(F.Stream, EXTENSION_BLOCK_ID)) { if (ASTReadResult Result = ReadExtensionBlock(F)) - return removeModulesAndReturn(Result); + return Failure; } // Once read, set the ModuleFile bit base offset and update the size in @@ -5605,17 +5592,20 @@ ASTReader::ReadSubmoduleBlock(ModuleFile &F, unsigned ClientLoadCapabilities) { } case SUBMODULE_UMBRELLA_HEADER: { + // FIXME: This doesn't work for framework modules as `Filename` is the + // name as written in the module file and does not include + // `Headers/`, so this path will never exist. 
std::string Filename = std::string(Blob); ResolveImportedPath(F, Filename); if (auto Umbrella = PP.getFileManager().getFile(Filename)) { - if (!CurrentModule->getUmbrellaHeader()) + if (!CurrentModule->getUmbrellaHeader()) { // FIXME: NameAsWritten ModMap.setUmbrellaHeader(CurrentModule, *Umbrella, Blob, ""); - else if (CurrentModule->getUmbrellaHeader().Entry != *Umbrella) { - if ((ClientLoadCapabilities & ARR_OutOfDate) == 0) - Error("mismatched umbrella headers in submodule"); - return OutOfDate; } + // Note that it's too late at this point to return out of date if the + // name from the PCM doesn't match up with the one in the module map, + // but also quite unlikely since we will have already checked the + // modification time and size of the module map file itself. } break; } @@ -5639,16 +5629,13 @@ ASTReader::ReadSubmoduleBlock(ModuleFile &F, unsigned ClientLoadCapabilities) { break; case SUBMODULE_UMBRELLA_DIR: { + // See comments in SUBMODULE_UMBRELLA_HEADER std::string Dirname = std::string(Blob); ResolveImportedPath(F, Dirname); if (auto Umbrella = PP.getFileManager().getDirectory(Dirname)) { - if (!CurrentModule->getUmbrellaDir()) + if (!CurrentModule->getUmbrellaDir()) { // FIXME: NameAsWritten ModMap.setUmbrellaDir(CurrentModule, *Umbrella, Blob, ""); - else if (CurrentModule->getUmbrellaDir().Entry != *Umbrella) { - if ((ClientLoadCapabilities & ARR_OutOfDate) == 0) - Error("mismatched umbrella directories in submodule"); - return OutOfDate; } } break; diff --git a/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp b/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp index 175dfcef0df45..a13de306eac84 100644 --- a/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/CheckObjCInstMethSignature.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines a CheckObjCInstMethSignature, a flow-insenstive check +// This file defines a CheckObjCInstMethSignature, a flow-insensitive check // that determines if an Objective-C class interface incorrectly redefines // the method signature in a subclass. 
// diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index a6470da09c458..7db4066653cbd 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -48,6 +48,7 @@ #include "InterCheckerAPI.h" #include "clang/AST/Attr.h" #include "clang/AST/DeclCXX.h" +#include "clang/AST/DeclTemplate.h" #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" #include "clang/AST/ParentMap.h" @@ -64,12 +65,15 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerHelpers.h" #include "clang/StaticAnalyzer/Core/PathSensitive/DynamicExtent.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/ExplodedGraph.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramStateTrait.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ProgramState_Fwd.h" #include "clang/StaticAnalyzer/Core/PathSensitive/SVals.h" +#include "clang/StaticAnalyzer/Core/PathSensitive/StoreRef.h" #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Compiler.h" @@ -298,6 +302,8 @@ class MallocChecker /// which might free a pointer are annotated. DefaultBool ShouldIncludeOwnershipAnnotatedFunctions; + DefaultBool ShouldRegisterNoOwnershipChangeVisitor; + /// Many checkers are essentially built into this one, so enabling them will /// make MallocChecker perform additional modeling and reporting. enum CheckKind { @@ -722,11 +728,146 @@ class MallocChecker bool isArgZERO_SIZE_PTR(ProgramStateRef State, CheckerContext &C, SVal ArgVal) const; }; +} // end anonymous namespace + +//===----------------------------------------------------------------------===// +// Definition of NoOwnershipChangeVisitor. +//===----------------------------------------------------------------------===// + +namespace { +class NoOwnershipChangeVisitor final : public NoStateChangeFuncVisitor { + SymbolRef Sym; + using OwnerSet = llvm::SmallPtrSet; + + // Collect which entities point to the allocated memory, and could be + // responsible for deallocating it. + class OwnershipBindingsHandler : public StoreManager::BindingsHandler { + SymbolRef Sym; + OwnerSet &Owners; + + public: + OwnershipBindingsHandler(SymbolRef Sym, OwnerSet &Owners) + : Sym(Sym), Owners(Owners) {} + + bool HandleBinding(StoreManager &SMgr, Store Store, const MemRegion *Region, + SVal Val) override { + if (Val.getAsSymbol() == Sym) + Owners.insert(Region); + return true; + } + }; + +protected: + OwnerSet getOwnersAtNode(const ExplodedNode *N) { + OwnerSet Ret; + + ProgramStateRef State = N->getState(); + OwnershipBindingsHandler Handler{Sym, Ret}; + State->getStateManager().getStoreManager().iterBindings(State->getStore(), + Handler); + return Ret; + } + + static const ExplodedNode *getCallExitEnd(const ExplodedNode *N) { + while (N && !N->getLocationAs()) + N = N->getFirstSucc(); + return N; + } + + virtual bool + wasModifiedBeforeCallExit(const ExplodedNode *CurrN, + const ExplodedNode *CallExitN) override { + if (CurrN->getLocationAs()) + return true; + + // Its the state right *after* the call that is interesting. 
Any pointers + // inside the call that pointed to the allocated memory are of little + // consequence if their lifetime ends within the function. + CallExitN = getCallExitEnd(CallExitN); + if (!CallExitN) + return true; + + if (CurrN->getState()->get(Sym) != + CallExitN->getState()->get(Sym)) + return true; + + OwnerSet CurrOwners = getOwnersAtNode(CurrN); + OwnerSet ExitOwners = getOwnersAtNode(CallExitN); + + // Owners in the current set may be purged from the analyzer later on. + // If a variable is dead (is not referenced directly or indirectly after + // some point), it will be removed from the Store before the end of its + // actual lifetime. + // This means that that if the ownership status didn't change, CurrOwners + // must be a superset of, but not necessarily equal to ExitOwners. + return !llvm::set_is_subset(ExitOwners, CurrOwners); + } + + static PathDiagnosticPieceRef emitNote(const ExplodedNode *N) { + PathDiagnosticLocation L = PathDiagnosticLocation::create( + N->getLocation(), + N->getState()->getStateManager().getContext().getSourceManager()); + return std::make_shared( + L, "Returning without deallocating memory or storing the pointer for " + "later deallocation"); + } + + virtual PathDiagnosticPieceRef + maybeEmitNoteForObjCSelf(PathSensitiveBugReport &R, + const ObjCMethodCall &Call, + const ExplodedNode *N) override { + // TODO: Implement. + return nullptr; + } + + virtual PathDiagnosticPieceRef + maybeEmitNoteForCXXThis(PathSensitiveBugReport &R, + const CXXConstructorCall &Call, + const ExplodedNode *N) override { + // TODO: Implement. + return nullptr; + } + + virtual PathDiagnosticPieceRef + maybeEmitNoteForParameters(PathSensitiveBugReport &R, const CallEvent &Call, + const ExplodedNode *N) override { + // TODO: Factor the logic of "what constitutes as an entity being passed + // into a function call" out by reusing the code in + // NoStoreFuncVisitor::maybeEmitNoteForParameters, maybe by incorporating + // the printing technology in UninitializedObject's FieldChainInfo. + ArrayRef Parameters = Call.parameters(); + for (unsigned I = 0; I < Call.getNumArgs() && I < Parameters.size(); ++I) { + SVal V = Call.getArgSVal(I); + if (V.getAsSymbol() == Sym) + return emitNote(N); + } + return nullptr; + } + +public: + NoOwnershipChangeVisitor(SymbolRef Sym) + : NoStateChangeFuncVisitor(bugreporter::TrackingKind::Thorough), + Sym(Sym) {} + + void Profile(llvm::FoldingSetNodeID &ID) const override { + static int Tag = 0; + ID.AddPointer(&Tag); + ID.AddPointer(Sym); + } + + void *getTag() const { + static int Tag = 0; + return static_cast(&Tag); + } +}; + +} // end anonymous namespace //===----------------------------------------------------------------------===// // Definition of MallocBugVisitor. //===----------------------------------------------------------------------===// +namespace { /// The bug visitor which allows us to print extra diagnostics along the /// BugReport path. For example, showing the allocation site of the leaked /// region. 
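As a side note on the heuristic above: the ownership test reduces to a set-inclusion check over the regions that point to the allocated memory. A stripped-down sketch (hypothetical helper and element type, but the same llvm::set_is_subset call the patch uses):

  #include "llvm/ADT/SetOperations.h"
  #include "llvm/ADT/SmallPtrSet.h"

  namespace {
  using OwnerSet = llvm::SmallPtrSet<const void *, 8>;

  // If ownership did not change inside the call, every owner recorded at the
  // call-exit node should already be known at the earlier node (dead regions
  // may have been purged, so the sets need not be equal). An exit-time owner
  // missing from the earlier set therefore signals a possible change.
  bool ownershipPossiblyChanged(const OwnerSet &CurrOwners,
                                const OwnerSet &ExitOwners) {
    return !llvm::set_is_subset(ExitOwners, CurrOwners);
  }
  } // namespace
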
@@ -851,7 +992,6 @@ class MallocBugVisitor final : public BugReporterVisitor { } }; }; - } // end anonymous namespace // A map from the freed symbol to the symbol representing the return value of @@ -2579,6 +2719,8 @@ void MallocChecker::HandleLeak(SymbolRef Sym, ExplodedNode *N, AllocNode->getLocationContext()->getDecl()); R->markInteresting(Sym); R->addVisitor(Sym, true); + if (ShouldRegisterNoOwnershipChangeVisitor) + R->addVisitor(Sym); C.emitReport(std::move(R)); } @@ -3395,6 +3537,9 @@ void ento::registerDynamicMemoryModeling(CheckerManager &mgr) { auto *checker = mgr.registerChecker(); checker->ShouldIncludeOwnershipAnnotatedFunctions = mgr.getAnalyzerOptions().getCheckerBooleanOption(checker, "Optimistic"); + checker->ShouldRegisterNoOwnershipChangeVisitor = + mgr.getAnalyzerOptions().getCheckerBooleanOption( + checker, "AddNoOwnershipChangeNotes"); } bool ento::shouldRegisterDynamicMemoryModeling(const CheckerManager &mgr) { diff --git a/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp index 90c5583d89691..dcca8be55e337 100644 --- a/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/NSErrorChecker.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file defines a CheckNSError, a flow-insenstive check +// This file defines a CheckNSError, a flow-insensitive check // that determines if an Objective-C class interface correctly returns // a non-void return type. // diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp index d06a2d4933038..ecd9b649c4f46 100644 --- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp +++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp @@ -343,46 +343,140 @@ BugReporterVisitor::getDefaultEndPath(const BugReporterContext &BRC, return P; } +//===----------------------------------------------------------------------===// +// Implementation of NoStateChangeFuncVisitor. +//===----------------------------------------------------------------------===// + +bool NoStateChangeFuncVisitor::isModifiedInFrame(const ExplodedNode *N) { + const LocationContext *Ctx = N->getLocationContext(); + const StackFrameContext *SCtx = Ctx->getStackFrame(); + if (!FramesModifyingCalculated.count(SCtx)) + findModifyingFrames(N); + return FramesModifying.count(SCtx); +} + +void NoStateChangeFuncVisitor::findModifyingFrames( + const ExplodedNode *const CallExitBeginN) { + + assert(CallExitBeginN->getLocationAs()); + const ExplodedNode *LastReturnN = CallExitBeginN; + const StackFrameContext *const OriginalSCtx = + CallExitBeginN->getLocationContext()->getStackFrame(); + + const ExplodedNode *CurrN = CallExitBeginN; + + do { + ProgramStateRef State = CurrN->getState(); + auto CallExitLoc = CurrN->getLocationAs(); + if (CallExitLoc) { + LastReturnN = CurrN; + } + + FramesModifyingCalculated.insert( + CurrN->getLocationContext()->getStackFrame()); + + if (wasModifiedBeforeCallExit(CurrN, LastReturnN)) { + const StackFrameContext *SCtx = CurrN->getStackFrame(); + while (!SCtx->inTopFrame()) { + auto p = FramesModifying.insert(SCtx); + if (!p.second) + break; // Frame and all its parents already inserted. + SCtx = SCtx->getParent()->getStackFrame(); + } + } + + // Stop calculation at the call to the current function. 
+ if (auto CE = CurrN->getLocationAs()) + if (CE->getCalleeContext() == OriginalSCtx) + break; + + CurrN = CurrN->getFirstPred(); + } while (CurrN); +} + +PathDiagnosticPieceRef NoStateChangeFuncVisitor::VisitNode( + const ExplodedNode *N, BugReporterContext &BR, PathSensitiveBugReport &R) { + + const LocationContext *Ctx = N->getLocationContext(); + const StackFrameContext *SCtx = Ctx->getStackFrame(); + ProgramStateRef State = N->getState(); + auto CallExitLoc = N->getLocationAs(); + + // No diagnostic if region was modified inside the frame. + if (!CallExitLoc || isModifiedInFrame(N)) + return nullptr; + + CallEventRef<> Call = + BR.getStateManager().getCallEventManager().getCaller(SCtx, State); + + // Optimistically suppress uninitialized value bugs that result + // from system headers having a chance to initialize the value + // but failing to do so. It's too unlikely a system header's fault. + // It's much more likely a situation in which the function has a failure + // mode that the user decided not to check. If we want to hunt such + // omitted checks, we should provide an explicit function-specific note + // describing the precondition under which the function isn't supposed to + // initialize its out-parameter, and additionally check that such + // precondition can actually be fulfilled on the current path. + if (Call->isInSystemHeader()) { + // We make an exception for system header functions that have no branches. + // Such functions unconditionally fail to initialize the variable. + // If they call other functions that have more paths within them, + // this suppression would still apply when we visit these inner functions. + // One common example of a standard function that doesn't ever initialize + // its out parameter is operator placement new; it's up to the follow-up + // constructor (if any) to initialize the memory. + if (!N->getStackFrame()->getCFG()->isLinear()) { + static int i = 0; + R.markInvalid(&i, nullptr); + } + return nullptr; + } + + if (const auto *MC = dyn_cast(Call)) { + // If we failed to construct a piece for self, we still want to check + // whether the entity of interest is in a parameter. + if (PathDiagnosticPieceRef Piece = maybeEmitNoteForObjCSelf(R, *MC, N)) + return Piece; + } + + if (const auto *CCall = dyn_cast(Call)) { + // Do not generate diagnostics for not modified parameters in + // constructors. + return maybeEmitNoteForCXXThis(R, *CCall, N); + } + + return maybeEmitNoteForParameters(R, *Call, N); +} + //===----------------------------------------------------------------------===// // Implementation of NoStoreFuncVisitor. //===----------------------------------------------------------------------===// namespace { - /// Put a diagnostic on return statement of all inlined functions /// for which the region of interest \p RegionOfInterest was passed into, /// but not written inside, and it has caused an undefined read or a null /// pointer dereference outside. -class NoStoreFuncVisitor final : public BugReporterVisitor { +class NoStoreFuncVisitor final : public NoStateChangeFuncVisitor { const SubRegion *RegionOfInterest; MemRegionManager &MmrMgr; const SourceManager &SM; const PrintingPolicy &PP; - bugreporter::TrackingKind TKind; /// Recursion limit for dereferencing fields when looking for the /// region of interest. /// The limit of two indicates that we will dereference fields only once. static const unsigned DEREFERENCE_LIMIT = 2; - /// Frames writing into \c RegionOfInterest. 
- /// This visitor generates a note only if a function does not write into - /// a region of interest. This information is not immediately available - /// by looking at the node associated with the exit from the function - /// (usually the return statement). To avoid recomputing the same information - /// many times (going up the path for each node and checking whether the - /// region was written into) we instead lazily compute the - /// stack frames along the path which write into the region of interest. - llvm::SmallPtrSet FramesModifyingRegion; - llvm::SmallPtrSet FramesModifyingCalculated; - using RegionVector = SmallVector; public: NoStoreFuncVisitor(const SubRegion *R, bugreporter::TrackingKind TKind) - : RegionOfInterest(R), MmrMgr(R->getMemRegionManager()), + : NoStateChangeFuncVisitor(TKind), RegionOfInterest(R), + MmrMgr(R->getMemRegionManager()), SM(MmrMgr.getContext().getSourceManager()), - PP(MmrMgr.getContext().getPrintingPolicy()), TKind(TKind) {} + PP(MmrMgr.getContext().getPrintingPolicy()) {} void Profile(llvm::FoldingSetNodeID &ID) const override { static int Tag = 0; @@ -395,11 +489,13 @@ class NoStoreFuncVisitor final : public BugReporterVisitor { return static_cast(&Tag); } - PathDiagnosticPieceRef VisitNode(const ExplodedNode *N, - BugReporterContext &BR, - PathSensitiveBugReport &R) override; - private: + /// \return Whether \c RegionOfInterest was modified at \p CurrN compared to + /// the value it holds in \p CallExitBeginN. + virtual bool + wasModifiedBeforeCallExit(const ExplodedNode *CurrN, + const ExplodedNode *CallExitBeginN) override; + /// Attempts to find the region of interest in a given record decl, /// by either following the base classes or fields. /// Dereferences fields up to a given recursion limit. @@ -411,20 +507,21 @@ class NoStoreFuncVisitor final : public BugReporterVisitor { const MemRegion *R, const RegionVector &Vec = {}, int depth = 0); - /// Check and lazily calculate whether the region of interest is - /// modified in the stack frame to which \p N belongs. - /// The calculation is cached in FramesModifyingRegion. - bool isRegionOfInterestModifiedInFrame(const ExplodedNode *N) { - const LocationContext *Ctx = N->getLocationContext(); - const StackFrameContext *SCtx = Ctx->getStackFrame(); - if (!FramesModifyingCalculated.count(SCtx)) - findModifyingFrames(N); - return FramesModifyingRegion.count(SCtx); - } + // Region of interest corresponds to an IVar, exiting a method + // which could have written into that IVar, but did not. + virtual PathDiagnosticPieceRef + maybeEmitNoteForObjCSelf(PathSensitiveBugReport &R, + const ObjCMethodCall &Call, + const ExplodedNode *N) override final; + + virtual PathDiagnosticPieceRef + maybeEmitNoteForCXXThis(PathSensitiveBugReport &R, + const CXXConstructorCall &Call, + const ExplodedNode *N) override final; - /// Write to \c FramesModifyingRegion all stack frames along - /// the path in the current stack frame which modify \c RegionOfInterest. - void findModifyingFrames(const ExplodedNode *N); + virtual PathDiagnosticPieceRef + maybeEmitNoteForParameters(PathSensitiveBugReport &R, const CallEvent &Call, + const ExplodedNode *N) override final; /// Consume the information on the no-store stack frame in order to /// either emit a note or suppress the report enirely. 
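In short, the mechanical part of this visitor (walking back to the call, caching which frames modified the entity, suppressing notes for linear system-header functions) now lives in the shared NoStateChangeFuncVisitor base, and NoStoreFuncVisitor, like the new NoOwnershipChangeVisitor, only answers entity-specific questions. A deliberately simplified, self-contained sketch of that shape (hypothetical Node/NoChangeVisitorBase names, not the analyzer's real API):

  #include <optional>
  #include <string>

  struct Node {                 // stand-in for ExplodedNode
    const Node *Pred = nullptr;
    bool AtCallExit = false;
  };

  class NoChangeVisitorBase {
  public:
    virtual ~NoChangeVisitorBase() = default;

    // Shared driver: emit a note at a call-exit node only if the subclass
    // reports that the tracked entity was never modified inside the callee.
    std::optional<std::string> visit(const Node *CallExitN) {
      if (!CallExitN->AtCallExit)
        return std::nullopt;
      for (const Node *Cur = CallExitN; Cur; Cur = Cur->Pred)
        if (wasModifiedBeforeCallExit(Cur, CallExitN))
          return std::nullopt;    // modified somewhere: stay silent
      return makeNote(CallExitN);
    }

  protected:
    // Subclass hooks, mirroring wasModifiedBeforeCallExit / maybeEmitNote*.
    virtual bool wasModifiedBeforeCallExit(const Node *CurrN,
                                           const Node *CallExitN) = 0;
    virtual std::optional<std::string> makeNote(const Node *N) = 0;
  };
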
@@ -436,22 +533,18 @@ class NoStoreFuncVisitor final : public BugReporterVisitor { const MemRegion *MatchedRegion, StringRef FirstElement, bool FirstIsReferenceType, unsigned IndirectionLevel); - /// Pretty-print region \p MatchedRegion to \p os. - /// \return Whether printing succeeded. - bool prettyPrintRegionName(StringRef FirstElement, bool FirstIsReferenceType, + bool prettyPrintRegionName(const RegionVector &FieldChain, const MemRegion *MatchedRegion, - const RegionVector &FieldChain, - int IndirectionLevel, + StringRef FirstElement, bool FirstIsReferenceType, + unsigned IndirectionLevel, llvm::raw_svector_ostream &os); - /// Print first item in the chain, return new separator. - static StringRef prettyPrintFirstElement(StringRef FirstElement, - bool MoreItemsExpected, - int IndirectionLevel, - llvm::raw_svector_ostream &os); + StringRef prettyPrintFirstElement(StringRef FirstElement, + bool MoreItemsExpected, + int IndirectionLevel, + llvm::raw_svector_ostream &os); }; - -} // end of anonymous namespace +} // namespace /// \return Whether the method declaration \p Parent /// syntactically has a binary operation writing into the ivar \p Ivar. @@ -486,25 +579,6 @@ static bool potentiallyWritesIntoIvar(const Decl *Parent, return false; } -/// Get parameters associated with runtime definition in order -/// to get the correct parameter name. -static ArrayRef getCallParameters(CallEventRef<> Call) { - // Use runtime definition, if available. - RuntimeDefinition RD = Call->getRuntimeDefinition(); - if (const auto *FD = dyn_cast_or_null(RD.getDecl())) - return FD->parameters(); - if (const auto *MD = dyn_cast_or_null(RD.getDecl())) - return MD->parameters(); - - return Call->parameters(); -} - -/// \return whether \p Ty points to a const type, or is a const reference. -static bool isPointerToConst(QualType Ty) { - return !Ty->getPointeeType().isNull() && - Ty->getPointeeType().getCanonicalType().isConstQualified(); -} - /// Attempts to find the region of interest in a given CXX decl, /// by either following the base classes or fields. /// Dereferences fields up to a given recursion limit. @@ -564,68 +638,66 @@ NoStoreFuncVisitor::findRegionOfInterestInRecord( } PathDiagnosticPieceRef -NoStoreFuncVisitor::VisitNode(const ExplodedNode *N, BugReporterContext &BR, - PathSensitiveBugReport &R) { - - const LocationContext *Ctx = N->getLocationContext(); - const StackFrameContext *SCtx = Ctx->getStackFrame(); - ProgramStateRef State = N->getState(); - auto CallExitLoc = N->getLocationAs(); - - // No diagnostic if region was modified inside the frame. - if (!CallExitLoc || isRegionOfInterestModifiedInFrame(N)) - return nullptr; - - CallEventRef<> Call = - BR.getStateManager().getCallEventManager().getCaller(SCtx, State); - - // Region of interest corresponds to an IVar, exiting a method - // which could have written into that IVar, but did not. 
- if (const auto *MC = dyn_cast(Call)) { - if (const auto *IvarR = dyn_cast(RegionOfInterest)) { - const MemRegion *SelfRegion = MC->getReceiverSVal().getAsRegion(); - if (RegionOfInterest->isSubRegionOf(SelfRegion) && - potentiallyWritesIntoIvar(Call->getRuntimeDefinition().getDecl(), - IvarR->getDecl())) - return maybeEmitNote(R, *Call, N, {}, SelfRegion, "self", - /*FirstIsReferenceType=*/false, 1); - } +NoStoreFuncVisitor::maybeEmitNoteForObjCSelf(PathSensitiveBugReport &R, + const ObjCMethodCall &Call, + const ExplodedNode *N) { + if (const auto *IvarR = dyn_cast(RegionOfInterest)) { + const MemRegion *SelfRegion = Call.getReceiverSVal().getAsRegion(); + if (RegionOfInterest->isSubRegionOf(SelfRegion) && + potentiallyWritesIntoIvar(Call.getRuntimeDefinition().getDecl(), + IvarR->getDecl())) + return maybeEmitNote(R, Call, N, {}, SelfRegion, "self", + /*FirstIsReferenceType=*/false, 1); } + return nullptr; +} - if (const auto *CCall = dyn_cast(Call)) { - const MemRegion *ThisR = CCall->getCXXThisVal().getAsRegion(); - if (RegionOfInterest->isSubRegionOf(ThisR) && - !CCall->getDecl()->isImplicit()) - return maybeEmitNote(R, *Call, N, {}, ThisR, "this", - /*FirstIsReferenceType=*/false, 1); +PathDiagnosticPieceRef +NoStoreFuncVisitor::maybeEmitNoteForCXXThis(PathSensitiveBugReport &R, + const CXXConstructorCall &Call, + const ExplodedNode *N) { + const MemRegion *ThisR = Call.getCXXThisVal().getAsRegion(); + if (RegionOfInterest->isSubRegionOf(ThisR) && !Call.getDecl()->isImplicit()) + return maybeEmitNote(R, Call, N, {}, ThisR, "this", + /*FirstIsReferenceType=*/false, 1); + + // Do not generate diagnostics for not modified parameters in + // constructors. + return nullptr; +} - // Do not generate diagnostics for not modified parameters in - // constructors. - return nullptr; - } +/// \return whether \p Ty points to a const type, or is a const reference. 
+static bool isPointerToConst(QualType Ty) { + return !Ty->getPointeeType().isNull() && + Ty->getPointeeType().getCanonicalType().isConstQualified(); +} - ArrayRef parameters = getCallParameters(Call); - for (unsigned I = 0; I < Call->getNumArgs() && I < parameters.size(); ++I) { - const ParmVarDecl *PVD = parameters[I]; - SVal V = Call->getArgSVal(I); +PathDiagnosticPieceRef NoStoreFuncVisitor::maybeEmitNoteForParameters( + PathSensitiveBugReport &R, const CallEvent &Call, const ExplodedNode *N) { + ArrayRef Parameters = Call.parameters(); + for (unsigned I = 0; I < Call.getNumArgs() && I < Parameters.size(); ++I) { + const ParmVarDecl *PVD = Parameters[I]; + SVal V = Call.getArgSVal(I); bool ParamIsReferenceType = PVD->getType()->isReferenceType(); std::string ParamName = PVD->getNameAsString(); - int IndirectionLevel = 1; + unsigned IndirectionLevel = 1; QualType T = PVD->getType(); while (const MemRegion *MR = V.getAsRegion()) { if (RegionOfInterest->isSubRegionOf(MR) && !isPointerToConst(T)) - return maybeEmitNote(R, *Call, N, {}, MR, ParamName, + return maybeEmitNote(R, Call, N, {}, MR, ParamName, ParamIsReferenceType, IndirectionLevel); QualType PT = T->getPointeeType(); if (PT.isNull() || PT->isVoidType()) break; + ProgramStateRef State = N->getState(); + if (const RecordDecl *RD = PT->getAsRecordDecl()) if (Optional P = findRegionOfInterestInRecord(RD, State, MR)) - return maybeEmitNote(R, *Call, N, *P, RegionOfInterest, ParamName, + return maybeEmitNote(R, Call, N, *P, RegionOfInterest, ParamName, ParamIsReferenceType, IndirectionLevel); V = State->getSVal(MR, PT); @@ -637,40 +709,11 @@ NoStoreFuncVisitor::VisitNode(const ExplodedNode *N, BugReporterContext &BR, return nullptr; } -void NoStoreFuncVisitor::findModifyingFrames(const ExplodedNode *N) { - assert(N->getLocationAs()); - ProgramStateRef LastReturnState = N->getState(); - SVal ValueAtReturn = LastReturnState->getSVal(RegionOfInterest); - const LocationContext *Ctx = N->getLocationContext(); - const StackFrameContext *OriginalSCtx = Ctx->getStackFrame(); - - do { - ProgramStateRef State = N->getState(); - auto CallExitLoc = N->getLocationAs(); - if (CallExitLoc) { - LastReturnState = State; - ValueAtReturn = LastReturnState->getSVal(RegionOfInterest); - } - - FramesModifyingCalculated.insert(N->getLocationContext()->getStackFrame()); - - if (wasRegionOfInterestModifiedAt(RegionOfInterest, N, ValueAtReturn)) { - const StackFrameContext *SCtx = N->getStackFrame(); - while (!SCtx->inTopFrame()) { - auto p = FramesModifyingRegion.insert(SCtx); - if (!p.second) - break; // Frame and all its parents already inserted. - SCtx = SCtx->getParent()->getStackFrame(); - } - } - - // Stop calculation at the call to the current function. 
- if (auto CE = N->getLocationAs()) - if (CE->getCalleeContext() == OriginalSCtx) - break; - - N = N->getFirstPred(); - } while (N); +bool NoStoreFuncVisitor::wasModifiedBeforeCallExit( + const ExplodedNode *CurrN, const ExplodedNode *CallExitBeginN) { + return ::wasRegionOfInterestModifiedAt( + RegionOfInterest, CurrN, + CallExitBeginN->getState()->getSVal(RegionOfInterest)); } static llvm::StringLiteral WillBeUsedForACondition = @@ -681,27 +724,6 @@ PathDiagnosticPieceRef NoStoreFuncVisitor::maybeEmitNote( const RegionVector &FieldChain, const MemRegion *MatchedRegion, StringRef FirstElement, bool FirstIsReferenceType, unsigned IndirectionLevel) { - // Optimistically suppress uninitialized value bugs that result - // from system headers having a chance to initialize the value - // but failing to do so. It's too unlikely a system header's fault. - // It's much more likely a situation in which the function has a failure - // mode that the user decided not to check. If we want to hunt such - // omitted checks, we should provide an explicit function-specific note - // describing the precondition under which the function isn't supposed to - // initialize its out-parameter, and additionally check that such - // precondition can actually be fulfilled on the current path. - if (Call.isInSystemHeader()) { - // We make an exception for system header functions that have no branches. - // Such functions unconditionally fail to initialize the variable. - // If they call other functions that have more paths within them, - // this suppression would still apply when we visit these inner functions. - // One common example of a standard function that doesn't ever initialize - // its out parameter is operator placement new; it's up to the follow-up - // constructor (if any) to initialize the memory. - if (!N->getStackFrame()->getCFG()->isLinear()) - R.markInvalid(getTag(), nullptr); - return nullptr; - } PathDiagnosticLocation L = PathDiagnosticLocation::create(N->getLocation(), SM); @@ -717,8 +739,8 @@ PathDiagnosticPieceRef NoStoreFuncVisitor::maybeEmitNote( os << "Returning without writing to '"; // Do not generate the note if failed to pretty-print. - if (!prettyPrintRegionName(FirstElement, FirstIsReferenceType, MatchedRegion, - FieldChain, IndirectionLevel, os)) + if (!prettyPrintRegionName(FieldChain, MatchedRegion, FirstElement, + FirstIsReferenceType, IndirectionLevel, os)) return nullptr; os << "'"; @@ -727,11 +749,11 @@ PathDiagnosticPieceRef NoStoreFuncVisitor::maybeEmitNote( return std::make_shared(L, os.str()); } -bool NoStoreFuncVisitor::prettyPrintRegionName(StringRef FirstElement, - bool FirstIsReferenceType, +bool NoStoreFuncVisitor::prettyPrintRegionName(const RegionVector &FieldChain, const MemRegion *MatchedRegion, - const RegionVector &FieldChain, - int IndirectionLevel, + StringRef FirstElement, + bool FirstIsReferenceType, + unsigned IndirectionLevel, llvm::raw_svector_ostream &os) { if (FirstIsReferenceType) @@ -1153,7 +1175,7 @@ class StoreSiteFinder final : public TrackingBugReporterVisitor { public: /// \param V We're searching for the store where \c R received this value. /// \param R The region we're tracking. - /// \param TKind May limit the amount of notes added to the bug report. + /// \param Options Tracking behavior options. /// \param OriginSFC Only adds notes when the last store happened in a /// different stackframe to this one. Disregarded if the tracking kind /// is thorough. 
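With the refactor, maybeEmitNoteForParameters walks Call.parameters() directly, skips pointer-to-const parameters via the relocated isPointerToConst() helper, and attaches the "Returning without writing to ..." note when a callee exits without storing through an out-parameter. The snippet below is a hedged, reproducer-style illustration of the kind of user code this visitor annotates; it is an assumed example, not taken from the patch's test suite, and the exact note wording may differ.

```cpp
// 'out' is a non-const pointer parameter, but on the '!ok' path the callee
// returns without ever storing through it, so the visitor can attach a note
// along the lines of "Returning without writing to '*out'" at that return.
void maybeInit(int *out, bool ok) {
  if (ok)
    *out = 42;
}

// A pointer-to-const parameter is rejected by isPointerToConst(): the callee
// could not have initialized it anyway, so no note is attached for 'cfg'.
int readOnly(const int *cfg) { return cfg ? *cfg : 0; }

int caller(bool ok) {
  int v;             // 'v' starts uninitialized
  maybeInit(&v, ok); // on the 'ok == false' path nothing is written
  (void)readOnly(&v);
  return v;          // use of a possibly uninitialized value; the no-store
                     // note explains which call failed to initialize it
}
```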
diff --git a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp index 3ee12c0bdf651..c90046ffb4131 100644 --- a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp @@ -1289,15 +1289,32 @@ var findNum = function() { return out; }; +var classListAdd = function(el, theClass) { + if(!el.className.baseVal) + el.className += " " + theClass; + else + el.className.baseVal += " " + theClass; +}; + +var classListRemove = function(el, theClass) { + var className = (!el.className.baseVal) ? + el.className : el.className.baseVal; + className = className.replace(" " + theClass, ""); + if(!el.className.baseVal) + el.className = className; + else + el.className.baseVal = className; +}; + var scrollTo = function(el) { querySelectorAllArray(".selected").forEach(function(s) { - s.classList.remove("selected"); + classListRemove(s, "selected"); }); - el.classList.add("selected"); + classListAdd(el, "selected"); window.scrollBy(0, el.getBoundingClientRect().top - (window.innerHeight / 2)); highlightArrowsForSelectedEvent(); -} +}; var move = function(num, up, numItems) { if (num == 1 && up || num == numItems - 1 && !up) { @@ -1332,9 +1349,11 @@ window.addEventListener("keydown", function (event) { if (event.defaultPrevented) { return; } - if (event.key == "j") { + // key 'j' + if (event.keyCode == 74) { navigateTo(/*up=*/false); - } else if (event.key == "k") { + // key 'k' + } else if (event.keyCode == 75) { navigateTo(/*up=*/true); } else { return; @@ -1350,8 +1369,11 @@ StringRef HTMLDiagnostics::generateArrowDrawingJavascript() {